felipecrv commented on code in PR #43772:
URL: https://github.com/apache/arrow/pull/43772#discussion_r1723735409


##########
cpp/src/arrow/compute/kernels/vector_selection_benchmark.cc:
##########
@@ -198,29 +209,41 @@ struct TakeBenchmark {
     for (auto _ : state) {
       ABORT_NOT_OK(Take(values, indices).status());
     }
-    state.SetItemsProcessed(state.iterations() * values->length());
+    state.SetItemsProcessed(state.iterations() * num_indices);
+    state.counters["selection_factor"] = selection_factor;
   }
 
   void BenchChunked(const std::shared_ptr<ChunkedArray>& values, bool 
chunk_indices_too) {
     double indices_null_proportion = indices_have_nulls ? args.null_proportion 
: 0;
-    auto indices =
-        rand.Int32(values->length(), 0, static_cast<int32_t>(values->length() 
- 1),
-                   indices_null_proportion);
+    const int64_t num_indices = static_cast<int64_t>(selection_factor * 
values->length());
+    auto indices = rand.Int32(num_indices, 0, 
static_cast<int32_t>(values->length() - 1),
+                              indices_null_proportion);
 
     if (monotonic_indices) {
       auto arg_sorter = *SortIndices(*indices);
       indices = *Take(*indices, *arg_sorter);
     }
     std::shared_ptr<ChunkedArray> chunked_indices;
     if (chunk_indices_too) {
+      // Here we choose for indices chunks to have roughly the same length
+      // as values chunks, but there may be fewer of them if selection_factor < 
1.0.
+      // The alternative is to have the same number of chunks, but with a 
potentially
+      // much smaller (and unrealistic) length.
       std::vector<std::shared_ptr<Array>> indices_chunks;
       int64_t offset = 0;
       for (int i = 0; i < values->num_chunks(); ++i) {
-        auto chunk = indices->Slice(offset, values->chunk(i)->length());
+        const auto chunk_length = values->chunk(i)->length();

Review Comment:
   `chunk_length = min(values->chunk(i)->length(), max_indices_chunk_length)` 
where
   
   `max_indices_chunk_length = (indices->length() - 1) / 2 + 1`
   
   This guarantees that no chunk is longer than half of the indices (rounded 
up) and, consequently, that there are at least 2 chunks of indices whenever 
there are at least 2 indices.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to