This is an automated email from the ASF dual-hosted git repository.

felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new e04f5b4b90 GH-41560: [C++] ChunkResolver: Implement ResolveMany and 
add unit tests (#41561)
e04f5b4b90 is described below

commit e04f5b4b905cfc37b5eaeea2c34e51349ae562b9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed May 15 20:59:49 2024 -0300

    GH-41560: [C++] ChunkResolver: Implement ResolveMany and add unit tests 
(#41561)
    
    ### Rationale for this change
    
    I want `ResolveMany` to support me in the implementation of `Take` that 
doesn't `Concatenate` all the chunks from a `ChunkedArray` `values` parameter.
    
    ### What changes are included in this PR?
    
     - Implementation of `ChunkResolver::ResolveMany()`
     - Addition of missing unit tests for `ChunkResolver`
    
    ### Are these changes tested?
    
    Yes. By new unit tests.
    
    ### Are there any user-facing changes?
    
    No. `ChunkResolver` is an internal API at the moment (see #34535 for future 
plans).
    * GitHub Issue: #41560
    
    Authored-by: Felipe Oliveira Carvalho <[email protected]>
    Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
 cpp/src/arrow/chunk_resolver.cc              |  80 ++++++++++-
 cpp/src/arrow/chunk_resolver.h               | 128 +++++++++++++++--
 cpp/src/arrow/chunked_array_test.cc          | 200 +++++++++++++++++++++++++++
 cpp/src/arrow/compute/kernels/vector_sort.cc |  30 ++--
 4 files changed, 407 insertions(+), 31 deletions(-)

diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc
index 29bccb5265..55eec53ced 100644
--- a/cpp/src/arrow/chunk_resolver.cc
+++ b/cpp/src/arrow/chunk_resolver.cc
@@ -19,14 +19,14 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <vector>
 
 #include "arrow/array.h"
 #include "arrow/record_batch.h"
 
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
 
 namespace {
 template <typename T>
@@ -54,6 +54,51 @@ inline std::vector<int64_t> MakeChunksOffsets(const 
std::vector<T>& chunks) {
   offsets[chunks.size()] = offset;
   return offsets;
 }
+
+/// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+/// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+template <typename IndexType>
+void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets,
+                       int64_t n_indices, const IndexType* logical_index_vec,
+                       IndexType* out_chunk_index_vec, IndexType chunk_hint,
+                       IndexType* out_index_in_chunk_vec) {
+  auto* offsets = reinterpret_cast<const uint64_t*>(signed_offsets);
+  const auto num_chunks = static_cast<IndexType>(num_offsets - 1);
+  // chunk_hint in [0, num_offsets) per the precondition.
+  for (int64_t i = 0; i < n_indices; i++) {
+    const auto index = static_cast<uint64_t>(logical_index_vec[i]);
+    if (index >= offsets[chunk_hint] &&
+        (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) {
+      out_chunk_index_vec[i] = chunk_hint;  // hint is correct!
+      continue;
+    }
+    // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
+    auto chunk_index =
+        ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
+    chunk_hint = static_cast<IndexType>(chunk_index);
+    out_chunk_index_vec[i] = chunk_hint;
+  }
+  if (out_index_in_chunk_vec != NULLPTR) {
+    for (int64_t i = 0; i < n_indices; i++) {
+      auto logical_index = logical_index_vec[i];
+      auto chunk_index = out_chunk_index_vec[i];
+      // chunk_index is in [0, chunks.size()] no matter what the
+      // value of logical_index is, so it's always safe to dereference
+      // offset_ as it contains chunks.size()+1 values.
+      out_index_in_chunk_vec[i] =
+          logical_index - static_cast<IndexType>(offsets[chunk_index]);
+#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)
+      // Make it more likely that Valgrind/ASAN can catch an invalid memory
+      // access by poisoning out_index_in_chunk_vec[i] when the logical
+      // index is out-of-bounds.
+      if (chunk_index == num_chunks) {
+        out_index_in_chunk_vec[i] = std::numeric_limits<IndexType>::max();
+      }
+#endif
+    }
+  }
+}
+
 }  // namespace
 
 ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept
@@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const 
ChunkResolver& other) noexcept {
   return *this;
 }
 
-}  // namespace internal
-}  // namespace arrow
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* 
logical_index_vec,
+                                    uint8_t* out_chunk_index_vec, uint8_t 
chunk_hint,
+                                    uint8_t* out_index_in_chunk_vec) const {
+  ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, 
logical_index_vec,
+                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* 
logical_index_vec,
+                                    uint32_t* out_chunk_index_vec, uint32_t 
chunk_hint,
+                                    uint32_t* out_index_in_chunk_vec) const {
+  ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, 
logical_index_vec,
+                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* 
logical_index_vec,
+                                    uint16_t* out_chunk_index_vec, uint16_t 
chunk_hint,
+                                    uint16_t* out_index_in_chunk_vec) const {
+  ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, 
logical_index_vec,
+                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* 
logical_index_vec,
+                                    uint64_t* out_chunk_index_vec, uint64_t 
chunk_hint,
+                                    uint64_t* out_index_in_chunk_vec) const {
+  ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, 
logical_index_vec,
+                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+}  // namespace arrow::internal
diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
index c5dad1a17b..a2a3d5a864 100644
--- a/cpp/src/arrow/chunk_resolver.h
+++ b/cpp/src/arrow/chunk_resolver.h
@@ -20,6 +20,8 @@
 #include <atomic>
 #include <cassert>
 #include <cstdint>
+#include <limits>
+#include <type_traits>
 #include <vector>
 
 #include "arrow/type_fwd.h"
@@ -27,6 +29,8 @@
 
 namespace arrow::internal {
 
+struct ChunkResolver;
+
 struct ChunkLocation {
   /// \brief Index of the chunk in the array of chunks
   ///
@@ -36,8 +40,17 @@ struct ChunkLocation {
 
   /// \brief Index of the value in the chunk
   ///
-  /// The value is undefined if chunk_index >= chunks.size()
+  /// The value is UNDEFINED if chunk_index >= chunks.size()
   int64_t index_in_chunk = 0;
+
+  ChunkLocation() = default;
+
+  ChunkLocation(int64_t chunk_index, int64_t index_in_chunk)
+      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {}
+
+  bool operator==(ChunkLocation other) const {
+    return chunk_index == other.chunk_index && index_in_chunk == 
other.index_in_chunk;
+  }
 };
 
 /// \brief An utility that incrementally resolves logical indices into
@@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver {
   explicit ChunkResolver(const std::vector<const Array*>& chunks) noexcept;
   explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
 
+  /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 
offsets.
+  ///
+  /// The first offset must be 0 and the last offset must be the logical 
length of the
+  /// chunked array. Each offset before the last represents the starting 
logical index of
+  /// the corresponding chunk.
+  explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
+      : offsets_(std::move(offsets)), cached_chunk_(0) {
+#ifndef NDEBUG
+    assert(offsets_.size() >= 1);
+    assert(offsets_[0] == 0);
+    for (size_t i = 1; i < offsets_.size(); i++) {
+      assert(offsets_[i] >= offsets_[i - 1]);
+    }
+#endif
+  }
+
   ChunkResolver(ChunkResolver&& other) noexcept;
   ChunkResolver& operator=(ChunkResolver&& other) noexcept;
 
   ChunkResolver(const ChunkResolver& other) noexcept;
   ChunkResolver& operator=(const ChunkResolver& other) noexcept;
 
+  int64_t logical_array_length() const { return offsets_.back(); }
+  int64_t num_chunks() const { return static_cast<int64_t>(offsets_.size()) - 
1; }
+
+  int64_t chunk_length(int64_t chunk_index) const {
+    return offsets_[chunk_index + 1] - offsets_[chunk_index];
+  }
+
   /// \brief Resolve a logical index to a ChunkLocation.
   ///
   /// The returned ChunkLocation contains the chunk index and the within-chunk 
index
@@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver {
     const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/true>(index, cached_chunk);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
   }
 
   /// \brief Resolve a logical index to a ChunkLocation.
@@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n_indices` logical indices to chunk indices.
+  ///
+  /// \pre 0 <= logical_index_vec[i] < logical_array_length()
+  ///      (for well-defined and valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n_indices`
+  /// \pre chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds)
+  /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and
+  ///       out_index_in_chunk_vec[i] are UNDEFINED
+  ///
+  /// \param n_indices The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* 
logical_index_vec,
+                                 IndexType* out_chunk_index_vec, IndexType 
chunk_hint = 0,
+                                 IndexType* out_index_in_chunk_vec = NULLPTR) 
const {
+    if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
+      // The max value returned by Bisect is `offsets.size() - 1` (= 
chunks.size()).
+      constexpr uint64_t kMaxIndexTypeValue = 
std::numeric_limits<IndexType>::max();
+      // A ChunkedArray with enough empty chunks can make the index of a chunk
+      // exceed the logical index and thus the maximum value of IndexType.
+      const bool chunk_index_fits_on_type =
+          static_cast<uint64_t>(offsets_.size() - 1) <= kMaxIndexTypeValue;
+      if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
+        return false;
+      }
+      // Since an index-in-chunk cannot possibly exceed the logical index being
+      // queried, we don't have to worry about these values not fitting on 
IndexType.
+    }
+    if constexpr (std::is_signed_v<IndexType>) {
+      // We interpret signed integers as unsigned and avoid having to generate 
double
+      // the amount of binary code to handle each integer width.
+      //
+      // Negative logical indices can become large values when cast to 
unsigned, and
+      // they are gracefully handled by ResolveManyImpl, but both the chunk 
index
+      // and the index in chunk values will be undefined in these cases. This
+      // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid
+      // logical index in the chunked array.
+      using U = std::make_unsigned_t<IndexType>;
+      ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
+                      reinterpret_cast<U*>(out_chunk_index_vec),
+                      static_cast<U>(chunk_hint),
+                      reinterpret_cast<U*>(out_index_in_chunk_vec));
+    } else {
+      static_assert(std::is_unsigned_v<IndexType>);
+      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, 
chunk_hint,
+                      out_index_in_chunk_vec);
+    }
+    return true;
   }
 
  private:
@@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver {
     return chunk_index;
   }
 
+  /// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+  /// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+  void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) 
const;
+  void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, 
uint16_t*) const;
+  void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, 
uint32_t*) const;
+  void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, 
uint64_t*) const;
+
+ public:
   /// \brief Find the index of the chunk that contains the logical index.
   ///
   /// Any non-negative index is accepted. When `hi=num_offsets`, the largest
   /// possible return value is `num_offsets-1` which is equal to
-  /// `chunks.size()`. The is returned when the logical index is out-of-bounds.
+  /// `chunks.size()`. Which is returned when the logical index is greater or
+  /// equal the logical length of the chunked array.
   ///
-  /// \pre index >= 0
+  /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned)
   /// \pre lo < hi
   /// \pre lo >= 0 && hi <= offsets_.size()
   static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t 
lo,
                                int64_t hi) {
+    return Bisect(static_cast<uint64_t>(index),
+                  reinterpret_cast<const uint64_t*>(offsets), 
static_cast<uint64_t>(lo),
+                  static_cast<uint64_t>(hi));
+  }
+
+  static inline int64_t Bisect(uint64_t index, const uint64_t* offsets, 
uint64_t lo,
+                               uint64_t hi) {
     // Similar to std::upper_bound(), but slightly different as our offsets
     // array always starts with 0.
     auto n = hi - lo;
@@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver {
     // (lo < hi is guaranteed by the precondition).
     assert(n > 1 && "lo < hi is a precondition of Bisect");
     do {
-      const int64_t m = n >> 1;
-      const int64_t mid = lo + m;
+      const uint64_t m = n >> 1;
+      const uint64_t mid = lo + m;
       if (index >= offsets[mid]) {
         lo = mid;
         n -= m;
diff --git a/cpp/src/arrow/chunked_array_test.cc 
b/cpp/src/arrow/chunked_array_test.cc
index 6ca52ab46c..e9cc283b53 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -23,6 +23,7 @@
 #include <memory>
 #include <vector>
 
+#include "arrow/chunk_resolver.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
 #include "arrow/testing/builder.h"
@@ -34,6 +35,9 @@
 
 namespace arrow {
 
+using internal::ChunkLocation;
+using internal::ChunkResolver;
+
 class TestChunkedArray : public ::testing::Test {
  protected:
   virtual void Construct() {
@@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) {
   ASSERT_RAISES(IndexError, carr.GetScalar(7));
 }
 
+// ChunkResolver tests
+
+using IndexTypes = ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t, 
int8_t,
+                                    int16_t, int32_t, int64_t>;
+
+TEST(TestChunkResolver, Resolve) {
+  ChunkResolver empty(std::vector<int64_t>({0}));  // []
+  // ChunkLocation::index_in_chunk is undefined when 
chunk_index==chunks.size(),
+  // so only chunk_index is compared in these cases.
+  ASSERT_EQ(empty.Resolve(0).chunk_index, 0);
+  ASSERT_EQ(empty.Resolve(0).chunk_index, 0);
+
+  ChunkResolver one(std::vector<int64_t>({0, 1}));  // [[0]]
+  ASSERT_EQ(one.Resolve(1).chunk_index, 1);
+  ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0)));
+  ASSERT_EQ(one.Resolve(1).chunk_index, 1);
+
+  ChunkResolver one_and_empty(std::vector<int64_t>({0, 1, 1, 1}));  // [[0], 
[], []]
+  ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3);
+  ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3);
+  ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3);
+  ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0)));
+  ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3);
+  ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3);
+  ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3);
+
+  ChunkResolver one_one_one(std::vector<int64_t>({0, 1, 2, 3}));  // [[0], 
[1], [2]]
+  ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3);
+  ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0)));
+  ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0)));
+  ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0)));
+  ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0)));
+  ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0)));
+  ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3);
+
+  ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10}));  // [[0, 1], 
[2], [3..9]]
+  ASSERT_EQ(resolver.Resolve(10).chunk_index, 3);
+  ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6)));
+  ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5)));
+  ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1)));
+  ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0)));
+  ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0)));
+  ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1)));
+  ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0)));
+  ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1)));
+  ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0)));
+  ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0)));
+  ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1)));
+  ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5)));
+  ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6)));
+  ASSERT_EQ(resolver.Resolve(10).chunk_index, 3);
+}
+
+template <typename T>
+class TestChunkResolverMany : public ::testing::Test {
+ public:
+  using IndexType = T;
+
+  Result<std::vector<ChunkLocation>> ResolveMany(
+      const ChunkResolver& resolver, const std::vector<IndexType>& 
logical_index_vec) {
+    const size_t n = logical_index_vec.size();
+    std::vector<IndexType> chunk_index_vec;
+    chunk_index_vec.resize(n);
+    std::vector<IndexType> index_in_chunk_vec;
+    index_in_chunk_vec.resize(n);
+    bool valid = resolver.ResolveMany<IndexType>(
+        static_cast<int64_t>(n), logical_index_vec.data(), 
chunk_index_vec.data(), 0,
+        index_in_chunk_vec.data());
+    if (ARROW_PREDICT_FALSE(!valid)) {
+      return Status::Invalid("index type doesn't fit possible chunk indexes");
+    }
+    std::vector<ChunkLocation> locations;
+    locations.reserve(n);
+    for (size_t i = 0; i < n; i++) {
+      auto chunk_index = static_cast<int64_t>(chunk_index_vec[i]);
+      auto index_in_chunk = static_cast<int64_t>(index_in_chunk_vec[i]);
+      locations.emplace_back(chunk_index, index_in_chunk);
+    }
+    return locations;
+  }
+
+  void CheckResolveMany(const ChunkResolver& resolver,
+                        const std::vector<IndexType>& logical_index_vec) {
+    ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, 
logical_index_vec));
+    EXPECT_EQ(logical_index_vec.size(), locations.size());
+    for (size_t i = 0; i < logical_index_vec.size(); i++) {
+      IndexType logical_index = logical_index_vec[i];
+      const auto expected = resolver.Resolve(logical_index);
+      ASSERT_LE(expected.chunk_index, resolver.num_chunks());
+      if (expected.chunk_index == resolver.num_chunks()) {
+        // index_in_chunk is undefined in this case
+        ASSERT_EQ(locations[i].chunk_index, expected.chunk_index);
+      } else {
+        ASSERT_EQ(locations[i], expected);
+      }
+    }
+  }
+
+  void TestBasics() {
+    std::vector<IndexType> logical_index_vec;
+
+    ChunkResolver empty(std::vector<int64_t>({0}));  // []
+    logical_index_vec = {0, 0};
+    CheckResolveMany(empty, logical_index_vec);
+
+    ChunkResolver one(std::vector<int64_t>({0, 1}));  // [[0]]
+    logical_index_vec = {1, 0, 1};
+    CheckResolveMany(one, logical_index_vec);
+
+    ChunkResolver one_and_empty(std::vector<int64_t>({0, 1, 1, 1}));  // [[0], 
[], []]
+    logical_index_vec = {3, 2, 1, 0, 1, 2, 3};
+    CheckResolveMany(one_and_empty, logical_index_vec);
+
+    ChunkResolver one_one_one(std::vector<int64_t>({0, 1, 2, 3}));  // [[0], 
[1], [2]]
+    logical_index_vec = {3, 2, 1, 0, 1, 2, 3};
+    CheckResolveMany(one_one_one, logical_index_vec);
+
+    ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10}));  // [[0, 1], 
[2], [3..9]]
+    logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10};
+    CheckResolveMany(resolver, logical_index_vec);
+  }
+
+  void TestOutOfBounds() {
+    ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10}));  // [[0, 1], 
[2], [3..9]]
+
+    std::vector<IndexType> logical_index_vec = {10, 11, 12, 13, 14, 13, 11, 
10};
+    ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, 
logical_index_vec));
+    EXPECT_EQ(logical_index_vec.size(), locations.size());
+    for (size_t i = 0; i < logical_index_vec.size(); i++) {
+      ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+    }
+
+    if constexpr (std::is_signed_v<IndexType>) {
+      std::vector<IndexType> logical_index_vec = {-1, -2, -3, -4, INT8_MIN};
+
+      ChunkResolver resolver(std::vector<int64_t>({0, 2, 128}));  // [[0, 1], 
[2..127]]
+      ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, 
logical_index_vec));
+      EXPECT_EQ(logical_index_vec.size(), locations.size());
+      for (size_t i = 0; i < logical_index_vec.size(); i++) {
+        // All the negative indices are greater than 
resolver.logical_array_length()-1
+        // when cast to uint8_t.
+        ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+      }
+
+      if constexpr (sizeof(IndexType) == 1) {
+        ChunkResolver resolver(std::vector<int64_t>(
+            {0, 2, 128, 129, 256}));  // [[0, 1], [2..127], [128], [129, 255]]
+        ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, 
logical_index_vec));
+        EXPECT_EQ(logical_index_vec.size(), locations.size());
+        for (size_t i = 0; i < logical_index_vec.size(); i++) {
+          if constexpr (sizeof(IndexType) == 1) {
+            // All the negative 8-bit indices are SMALLER than
+            // resolver.logical_array_length()=256 when cast to 8-bit unsigned 
integers.
+            // So the resolved locations might look valid, but they should not 
be trusted.
+            ASSERT_LT(locations[i].chunk_index, resolver.num_chunks());
+          } else {
+            // All the negative indices are greater than 
resolver.logical_array_length()
+            // when cast to 16/32/64-bit unsigned integers.
+            ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+          }
+        }
+      }
+    }
+  }
+
+  void TestOverflow() {
+    const int64_t kMaxIndex = std::is_signed_v<IndexType> ? 127 : 255;
+    std::vector<IndexType> logical_index_vec = {0, 1, 2,
+                                                
static_cast<IndexType>(kMaxIndex)};
+
+    // Overflows are rare because to make them possible, we need more chunks
+    // than logical elements in the ChunkedArray. That requires at least one
+    // empty chunk.
+    std::vector<int64_t> offsets;
+    for (int64_t i = 0; i <= kMaxIndex; i++) {
+      offsets.push_back(i);
+    }
+    ChunkResolver resolver{offsets};
+    ASSERT_OK(ResolveMany(resolver, logical_index_vec));
+
+    offsets.push_back(kMaxIndex);  // adding an empty chunk
+    ChunkResolver resolver_with_empty{offsets};
+    if (sizeof(IndexType) == 1) {
+      ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec));
+    } else {
+      ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec));
+    }
+  }
+};
+
+TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes);
+
+TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); }
+TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); }
+TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); }
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc 
b/cpp/src/arrow/compute/kernels/vector_sort.cc
index db2023ef04..ad22fa8d36 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -747,15 +747,13 @@ class TableSorter {
     auto& comparator = comparator_;
     const auto& first_sort_key = sort_keys_[0];
 
-    ChunkLocation left_loc{0, 0};
-    ChunkLocation right_loc{0, 0};
+    ChunkLocation left_loc;
+    ChunkLocation right_loc;
     std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, 
temp_indices,
                [&](uint64_t left, uint64_t right) {
                  // First column is either null or nan
-                 left_loc =
-                     left_resolver_.ResolveWithChunkIndexHint(left, 
/*hint=*/left_loc);
-                 right_loc =
-                     right_resolver_.ResolveWithChunkIndexHint(right, 
/*hint=*/right_loc);
+                 left_loc = left_resolver_.ResolveWithHint(left, 
/*hint=*/left_loc);
+                 right_loc = right_resolver_.ResolveWithHint(right, 
/*hint=*/right_loc);
                  auto chunk_left = first_sort_key.GetChunk(left_loc);
                  auto chunk_right = first_sort_key.GetChunk(right_loc);
                  const auto left_is_null = chunk_left.IsNull();
@@ -786,15 +784,13 @@ class TableSorter {
     // Untyped implementation
     auto& comparator = comparator_;
 
-    ChunkLocation left_loc{0, 0};
-    ChunkLocation right_loc{0, 0};
+    ChunkLocation left_loc;
+    ChunkLocation right_loc;
     std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, 
temp_indices,
                [&](uint64_t left, uint64_t right) {
                  // First column is always null
-                 left_loc =
-                     left_resolver_.ResolveWithChunkIndexHint(left, 
/*hint=*/left_loc);
-                 right_loc =
-                     right_resolver_.ResolveWithChunkIndexHint(right, 
/*hint=*/right_loc);
+                 left_loc = left_resolver_.ResolveWithHint(left, 
/*hint=*/left_loc);
+                 right_loc = right_resolver_.ResolveWithHint(right, 
/*hint=*/right_loc);
                  return comparator.Compare(left_loc, right_loc, 1);
                });
     // Copy back temp area into main buffer
@@ -812,15 +808,13 @@ class TableSorter {
     auto& comparator = comparator_;
     const auto& first_sort_key = sort_keys_[0];
 
-    ChunkLocation left_loc{0, 0};
-    ChunkLocation right_loc{0, 0};
+    ChunkLocation left_loc;
+    ChunkLocation right_loc;
     std::merge(range_begin, range_middle, range_middle, range_end, 
temp_indices,
                [&](uint64_t left, uint64_t right) {
                  // Both values are never null nor NaN.
-                 left_loc =
-                     left_resolver_.ResolveWithChunkIndexHint(left, 
/*hint=*/left_loc);
-                 right_loc =
-                     right_resolver_.ResolveWithChunkIndexHint(right, 
/*hint=*/right_loc);
+                 left_loc = left_resolver_.ResolveWithHint(left, 
/*hint=*/left_loc);
+                 right_loc = right_resolver_.ResolveWithHint(right, 
/*hint=*/right_loc);
                  auto chunk_left = first_sort_key.GetChunk(left_loc);
                  auto chunk_right = first_sort_key.GetChunk(right_loc);
                  DCHECK(!chunk_left.IsNull());

Reply via email to