This is an automated email from the ASF dual-hosted git repository.
felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new e04f5b4b90 GH-41560: [C++] ChunkResolver: Implement ResolveMany and
add unit tests (#41561)
e04f5b4b90 is described below
commit e04f5b4b905cfc37b5eaeea2c34e51349ae562b9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed May 15 20:59:49 2024 -0300
GH-41560: [C++] ChunkResolver: Implement ResolveMany and add unit tests
(#41561)
### Rationale for this change
I want `ResolveMany` to support me in the implementation of `Take` that
doesn't `Concatenate` all the chunks from a `ChunkedArray` `values` parameter.
### What changes are included in this PR?
- Implementation of `ChunkResolver::ResolveMany()`
- Addition of missing unit tests for `ChunkResolver`
### Are these changes tested?
Yes. By new unit tests.
### Are there any user-facing changes?
No. `ChunkResolver` is an internal API at the moment (see #34535 for future
plans).
* GitHub Issue: #41560
Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
cpp/src/arrow/chunk_resolver.cc | 80 ++++++++++-
cpp/src/arrow/chunk_resolver.h | 128 +++++++++++++++--
cpp/src/arrow/chunked_array_test.cc | 200 +++++++++++++++++++++++++++
cpp/src/arrow/compute/kernels/vector_sort.cc | 30 ++--
4 files changed, 407 insertions(+), 31 deletions(-)
diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc
index 29bccb5265..55eec53ced 100644
--- a/cpp/src/arrow/chunk_resolver.cc
+++ b/cpp/src/arrow/chunk_resolver.cc
@@ -19,14 +19,14 @@
#include <algorithm>
#include <cstdint>
+#include <limits>
#include <memory>
#include <vector>
#include "arrow/array.h"
#include "arrow/record_batch.h"
-namespace arrow {
-namespace internal {
+namespace arrow::internal {
namespace {
template <typename T>
@@ -54,6 +54,51 @@ inline std::vector<int64_t> MakeChunksOffsets(const
std::vector<T>& chunks) {
offsets[chunks.size()] = offset;
return offsets;
}
+
+/// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+/// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+template <typename IndexType>
+void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets,
+ int64_t n_indices, const IndexType* logical_index_vec,
+ IndexType* out_chunk_index_vec, IndexType chunk_hint,
+ IndexType* out_index_in_chunk_vec) {
+ auto* offsets = reinterpret_cast<const uint64_t*>(signed_offsets);
+ const auto num_chunks = static_cast<IndexType>(num_offsets - 1);
+ // chunk_hint in [0, num_offsets) per the precondition.
+ for (int64_t i = 0; i < n_indices; i++) {
+ const auto index = static_cast<uint64_t>(logical_index_vec[i]);
+ if (index >= offsets[chunk_hint] &&
+ (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) {
+ out_chunk_index_vec[i] = chunk_hint; // hint is correct!
+ continue;
+ }
+ // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
+ auto chunk_index =
+ ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
+ chunk_hint = static_cast<IndexType>(chunk_index);
+ out_chunk_index_vec[i] = chunk_hint;
+ }
+ if (out_index_in_chunk_vec != NULLPTR) {
+ for (int64_t i = 0; i < n_indices; i++) {
+ auto logical_index = logical_index_vec[i];
+ auto chunk_index = out_chunk_index_vec[i];
+ // chunk_index is in [0, chunks.size()] no matter what the
+ // value of logical_index is, so it's always safe to dereference
+ // offsets as it contains chunks.size()+1 values.
+ out_index_in_chunk_vec[i] =
+ logical_index - static_cast<IndexType>(offsets[chunk_index]);
+#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)
+ // Make it more likely that Valgrind/ASAN can catch an invalid memory
+ // access by poisoning out_index_in_chunk_vec[i] when the logical
+ // index is out-of-bounds.
+ if (chunk_index == num_chunks) {
+ out_index_in_chunk_vec[i] = std::numeric_limits<IndexType>::max();
+ }
+#endif
+ }
+ }
+}
+
} // namespace
ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept
@@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const
ChunkResolver& other) noexcept {
return *this;
}
-} // namespace internal
-} // namespace arrow
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t*
logical_index_vec,
+ uint8_t* out_chunk_index_vec, uint8_t
chunk_hint,
+ uint8_t* out_index_in_chunk_vec) const {
+ ResolveManyInline(offsets_.size(), offsets_.data(), n_indices,
logical_index_vec,
+ out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t*
logical_index_vec,
+ uint32_t* out_chunk_index_vec, uint32_t
chunk_hint,
+ uint32_t* out_index_in_chunk_vec) const {
+ ResolveManyInline(offsets_.size(), offsets_.data(), n_indices,
logical_index_vec,
+ out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t*
logical_index_vec,
+ uint16_t* out_chunk_index_vec, uint16_t
chunk_hint,
+ uint16_t* out_index_in_chunk_vec) const {
+ ResolveManyInline(offsets_.size(), offsets_.data(), n_indices,
logical_index_vec,
+ out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t*
logical_index_vec,
+ uint64_t* out_chunk_index_vec, uint64_t
chunk_hint,
+ uint64_t* out_index_in_chunk_vec) const {
+ ResolveManyInline(offsets_.size(), offsets_.data(), n_indices,
logical_index_vec,
+ out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+}
+
+} // namespace arrow::internal
diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
index c5dad1a17b..a2a3d5a864 100644
--- a/cpp/src/arrow/chunk_resolver.h
+++ b/cpp/src/arrow/chunk_resolver.h
@@ -20,6 +20,8 @@
#include <atomic>
#include <cassert>
#include <cstdint>
+#include <limits>
+#include <type_traits>
#include <vector>
#include "arrow/type_fwd.h"
@@ -27,6 +29,8 @@
namespace arrow::internal {
+struct ChunkResolver;
+
struct ChunkLocation {
/// \brief Index of the chunk in the array of chunks
///
@@ -36,8 +40,17 @@ struct ChunkLocation {
/// \brief Index of the value in the chunk
///
- /// The value is undefined if chunk_index >= chunks.size()
+ /// The value is UNDEFINED if chunk_index >= chunks.size()
int64_t index_in_chunk = 0;
+
+ ChunkLocation() = default;
+
+ ChunkLocation(int64_t chunk_index, int64_t index_in_chunk)
+ : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {}
+
+ bool operator==(ChunkLocation other) const {
+ return chunk_index == other.chunk_index && index_in_chunk ==
other.index_in_chunk;
+ }
};
/// \brief A utility that incrementally resolves logical indices into
@@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver {
explicit ChunkResolver(const std::vector<const Array*>& chunks) noexcept;
explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
+ /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1
offsets.
+ ///
+ /// The first offset must be 0 and the last offset must be the logical
length of the
+ /// chunked array. Each offset before the last represents the starting
logical index of
+ /// the corresponding chunk.
+ explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
+ : offsets_(std::move(offsets)), cached_chunk_(0) {
+#ifndef NDEBUG
+ assert(offsets_.size() >= 1);
+ assert(offsets_[0] == 0);
+ for (size_t i = 1; i < offsets_.size(); i++) {
+ assert(offsets_[i] >= offsets_[i - 1]);
+ }
+#endif
+ }
+
ChunkResolver(ChunkResolver&& other) noexcept;
ChunkResolver& operator=(ChunkResolver&& other) noexcept;
ChunkResolver(const ChunkResolver& other) noexcept;
ChunkResolver& operator=(const ChunkResolver& other) noexcept;
+ int64_t logical_array_length() const { return offsets_.back(); }
+ int64_t num_chunks() const { return static_cast<int64_t>(offsets_.size()) -
1; }
+
+ int64_t chunk_length(int64_t chunk_index) const {
+ return offsets_[chunk_index + 1] - offsets_[chunk_index];
+ }
+
/// \brief Resolve a logical index to a ChunkLocation.
///
/// The returned ChunkLocation contains the chunk index and the within-chunk
index
@@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver {
const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
const auto chunk_index =
ResolveChunkIndex</*StoreCachedChunk=*/true>(index, cached_chunk);
- return {chunk_index, index - offsets_[chunk_index]};
+ return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
}
/// \brief Resolve a logical index to a ChunkLocation.
@@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver {
/// \return ChunkLocation with a valid chunk_index if index is within
/// bounds, or with chunk_index == chunks.size() if logical index is
/// `>= chunked_array.length()`.
- inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
- ChunkLocation hint) const {
+ inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint)
const {
assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
const auto chunk_index =
ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
- return {chunk_index, index - offsets_[chunk_index]};
+ return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
+ }
+
+ /// \brief Resolve `n_indices` logical indices to chunk indices.
+ ///
+ /// \pre 0 <= logical_index_vec[i] < logical_array_length()
+ /// (for well-defined and valid chunk index results)
+ /// \pre out_chunk_index_vec has space for `n_indices`
+ /// \pre chunk_hint in [0, chunks.size()]
+ /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+ /// \post if logical_index_vec[i] >= chunked_array.length(), then
+ /// out_chunk_index_vec[i] == chunks.size()
+ /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds)
+ /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and
+ /// out_index_in_chunk_vec[i] are UNDEFINED
+ ///
+ /// \param n_indices The number of logical indices to resolve
+ /// \param logical_index_vec The logical indices to resolve
+ /// \param out_chunk_index_vec The output array where the chunk indices will
be written
+ /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+ /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+ /// within-chunk indices will be written
+ /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+ template <typename IndexType>
+ [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType*
logical_index_vec,
+ IndexType* out_chunk_index_vec, IndexType
chunk_hint = 0,
+ IndexType* out_index_in_chunk_vec = NULLPTR)
const {
+ if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
+ // The max value returned by Bisect is `offsets.size() - 1` (=
chunks.size()).
+ constexpr uint64_t kMaxIndexTypeValue =
std::numeric_limits<IndexType>::max();
+ // A ChunkedArray with enough empty chunks can make the index of a chunk
+ // exceed the logical index and thus the maximum value of IndexType.
+ const bool chunk_index_fits_on_type =
+ static_cast<uint64_t>(offsets_.size() - 1) <= kMaxIndexTypeValue;
+ if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
+ return false;
+ }
+ // Since an index-in-chunk cannot possibly exceed the logical index being
+ // queried, we don't have to worry about these values not fitting on
IndexType.
+ }
+ if constexpr (std::is_signed_v<IndexType>) {
+ // We interpret signed integers as unsigned and avoid having to generate
double
+ // the amount of binary code to handle each integer width.
+ //
+ // Negative logical indices can become large values when cast to
unsigned, and
+ // they are gracefully handled by ResolveManyImpl, but both the chunk
index
+ // and the index in chunk values will be undefined in these cases. This
+ // happens because int8_t(-1) == uint8_t(255) and 255 could be a valid
+ // logical index in the chunked array.
+ using U = std::make_unsigned_t<IndexType>;
+ ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
+ reinterpret_cast<U*>(out_chunk_index_vec),
+ static_cast<U>(chunk_hint),
+ reinterpret_cast<U*>(out_index_in_chunk_vec));
+ } else {
+ static_assert(std::is_unsigned_v<IndexType>);
+ ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec,
chunk_hint,
+ out_index_in_chunk_vec);
+ }
+ return true;
}
private:
@@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver {
return chunk_index;
}
+ /// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+ /// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+ void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*)
const;
+ void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t,
uint16_t*) const;
+ void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t,
uint32_t*) const;
+ void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t,
uint64_t*) const;
+
+ public:
/// \brief Find the index of the chunk that contains the logical index.
///
/// Any non-negative index is accepted. When `hi=num_offsets`, the largest
/// possible return value is `num_offsets-1` which is equal to
- /// `chunks.size()`. The is returned when the logical index is out-of-bounds.
+ /// `chunks.size()`. This value is returned when the logical index is greater
+ /// than or equal to the logical length of the chunked array.
///
- /// \pre index >= 0
+ /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned)
/// \pre lo < hi
/// \pre lo >= 0 && hi <= offsets_.size()
static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t
lo,
int64_t hi) {
+ return Bisect(static_cast<uint64_t>(index),
+ reinterpret_cast<const uint64_t*>(offsets),
static_cast<uint64_t>(lo),
+ static_cast<uint64_t>(hi));
+ }
+
+ static inline int64_t Bisect(uint64_t index, const uint64_t* offsets,
uint64_t lo,
+ uint64_t hi) {
// Similar to std::upper_bound(), but slightly different as our offsets
// array always starts with 0.
auto n = hi - lo;
@@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver {
// (lo < hi is guaranteed by the precondition).
assert(n > 1 && "lo < hi is a precondition of Bisect");
do {
- const int64_t m = n >> 1;
- const int64_t mid = lo + m;
+ const uint64_t m = n >> 1;
+ const uint64_t mid = lo + m;
if (index >= offsets[mid]) {
lo = mid;
n -= m;
diff --git a/cpp/src/arrow/chunked_array_test.cc
b/cpp/src/arrow/chunked_array_test.cc
index 6ca52ab46c..e9cc283b53 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -23,6 +23,7 @@
#include <memory>
#include <vector>
+#include "arrow/chunk_resolver.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/testing/builder.h"
@@ -34,6 +35,9 @@
namespace arrow {
+using internal::ChunkLocation;
+using internal::ChunkResolver;
+
class TestChunkedArray : public ::testing::Test {
protected:
virtual void Construct() {
@@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) {
ASSERT_RAISES(IndexError, carr.GetScalar(7));
}
+// ChunkResolver tests
+
+using IndexTypes = ::testing::Types<uint8_t, uint16_t, uint32_t, uint64_t,
int8_t,
+ int16_t, int32_t, int64_t>;
+
+TEST(TestChunkResolver, Resolve) {
+ ChunkResolver empty(std::vector<int64_t>({0})); // []
+ // ChunkLocation::index_in_chunk is undefined when
chunk_index==chunks.size(),
+ // so only chunk_index is compared in these cases.
+ ASSERT_EQ(empty.Resolve(0).chunk_index, 0);
+ ASSERT_EQ(empty.Resolve(0).chunk_index, 0);
+
+ ChunkResolver one(std::vector<int64_t>({0, 1})); // [[0]]
+ ASSERT_EQ(one.Resolve(1).chunk_index, 1);
+ ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0)));
+ ASSERT_EQ(one.Resolve(1).chunk_index, 1);
+
+ ChunkResolver one_and_empty(std::vector<int64_t>({0, 1, 1, 1})); // [[0],
[], []]
+ ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3);
+ ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3);
+ ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3);
+ ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0)));
+ ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3);
+ ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3);
+ ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3);
+
+ ChunkResolver one_one_one(std::vector<int64_t>({0, 1, 2, 3})); // [[0],
[1], [2]]
+ ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3);
+ ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0)));
+ ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0)));
+ ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0)));
+ ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0)));
+ ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0)));
+ ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3);
+
+ ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10})); // [[0, 1],
[2], [3..9]]
+ ASSERT_EQ(resolver.Resolve(10).chunk_index, 3);
+ ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6)));
+ ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5)));
+ ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1)));
+ ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0)));
+ ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0)));
+ ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1)));
+ ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0)));
+ ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1)));
+ ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0)));
+ ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0)));
+ ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1)));
+ ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5)));
+ ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6)));
+ ASSERT_EQ(resolver.Resolve(10).chunk_index, 3);
+}
+
+template <typename T>
+class TestChunkResolverMany : public ::testing::Test {
+ public:
+ using IndexType = T;
+
+ Result<std::vector<ChunkLocation>> ResolveMany(
+ const ChunkResolver& resolver, const std::vector<IndexType>&
logical_index_vec) {
+ const size_t n = logical_index_vec.size();
+ std::vector<IndexType> chunk_index_vec;
+ chunk_index_vec.resize(n);
+ std::vector<IndexType> index_in_chunk_vec;
+ index_in_chunk_vec.resize(n);
+ bool valid = resolver.ResolveMany<IndexType>(
+ static_cast<int64_t>(n), logical_index_vec.data(),
chunk_index_vec.data(), 0,
+ index_in_chunk_vec.data());
+ if (ARROW_PREDICT_FALSE(!valid)) {
+ return Status::Invalid("index type doesn't fit possible chunk indexes");
+ }
+ std::vector<ChunkLocation> locations;
+ locations.reserve(n);
+ for (size_t i = 0; i < n; i++) {
+ auto chunk_index = static_cast<int64_t>(chunk_index_vec[i]);
+ auto index_in_chunk = static_cast<int64_t>(index_in_chunk_vec[i]);
+ locations.emplace_back(chunk_index, index_in_chunk);
+ }
+ return locations;
+ }
+
+ void CheckResolveMany(const ChunkResolver& resolver,
+ const std::vector<IndexType>& logical_index_vec) {
+ ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver,
logical_index_vec));
+ EXPECT_EQ(logical_index_vec.size(), locations.size());
+ for (size_t i = 0; i < logical_index_vec.size(); i++) {
+ IndexType logical_index = logical_index_vec[i];
+ const auto expected = resolver.Resolve(logical_index);
+ ASSERT_LE(expected.chunk_index, resolver.num_chunks());
+ if (expected.chunk_index == resolver.num_chunks()) {
+ // index_in_chunk is undefined in this case
+ ASSERT_EQ(locations[i].chunk_index, expected.chunk_index);
+ } else {
+ ASSERT_EQ(locations[i], expected);
+ }
+ }
+ }
+
+ void TestBasics() {
+ std::vector<IndexType> logical_index_vec;
+
+ ChunkResolver empty(std::vector<int64_t>({0})); // []
+ logical_index_vec = {0, 0};
+ CheckResolveMany(empty, logical_index_vec);
+
+ ChunkResolver one(std::vector<int64_t>({0, 1})); // [[0]]
+ logical_index_vec = {1, 0, 1};
+ CheckResolveMany(one, logical_index_vec);
+
+ ChunkResolver one_and_empty(std::vector<int64_t>({0, 1, 1, 1})); // [[0],
[], []]
+ logical_index_vec = {3, 2, 1, 0, 1, 2, 3};
+ CheckResolveMany(one_and_empty, logical_index_vec);
+
+ ChunkResolver one_one_one(std::vector<int64_t>({0, 1, 2, 3})); // [[0],
[1], [2]]
+ logical_index_vec = {3, 2, 1, 0, 1, 2, 3};
+ CheckResolveMany(one_one_one, logical_index_vec);
+
+ ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10})); // [[0, 1],
[2], [3..9]]
+ logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10};
+ CheckResolveMany(resolver, logical_index_vec);
+ }
+
+ void TestOutOfBounds() {
+ ChunkResolver resolver(std::vector<int64_t>({0, 2, 3, 10})); // [[0, 1],
[2], [3..9]]
+
+ std::vector<IndexType> logical_index_vec = {10, 11, 12, 13, 14, 13, 11,
10};
+ ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver,
logical_index_vec));
+ EXPECT_EQ(logical_index_vec.size(), locations.size());
+ for (size_t i = 0; i < logical_index_vec.size(); i++) {
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+ }
+
+ if constexpr (std::is_signed_v<IndexType>) {
+ std::vector<IndexType> logical_index_vec = {-1, -2, -3, -4, INT8_MIN};
+
+ ChunkResolver resolver(std::vector<int64_t>({0, 2, 128})); // [[0, 1],
[2..127]]
+ ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver,
logical_index_vec));
+ EXPECT_EQ(logical_index_vec.size(), locations.size());
+ for (size_t i = 0; i < logical_index_vec.size(); i++) {
+ // All the negative indices are greater than
resolver.logical_array_length()-1
+ // when cast to uint8_t.
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+ }
+
+ if constexpr (sizeof(IndexType) == 1) {
+ ChunkResolver resolver(std::vector<int64_t>(
+ {0, 2, 128, 129, 256})); // [[0, 1], [2..127], [128], [129, 255]]
+ ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver,
logical_index_vec));
+ EXPECT_EQ(logical_index_vec.size(), locations.size());
+ for (size_t i = 0; i < logical_index_vec.size(); i++) {
+ if constexpr (sizeof(IndexType) == 1) {
+ // All the negative 8-bit indices are SMALLER than
+ // resolver.logical_array_length()=256 when cast to 8-bit unsigned
integers.
+ // So the resolved locations might look valid, but they should not
be trusted.
+ ASSERT_LT(locations[i].chunk_index, resolver.num_chunks());
+ } else {
+ // All the negative indices are greater than
resolver.logical_array_length()
+ // when cast to 16/32/64-bit unsigned integers.
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks());
+ }
+ }
+ }
+ }
+ }
+
+ void TestOverflow() {
+ const int64_t kMaxIndex = std::is_signed_v<IndexType> ? 127 : 255;
+ std::vector<IndexType> logical_index_vec = {0, 1, 2,
+
static_cast<IndexType>(kMaxIndex)};
+
+ // Overflows are rare because to make them possible, we need more chunks
+ // than logical elements in the ChunkedArray. That requires at least one
+ // empty chunk.
+ std::vector<int64_t> offsets;
+ for (int64_t i = 0; i <= kMaxIndex; i++) {
+ offsets.push_back(i);
+ }
+ ChunkResolver resolver{offsets};
+ ASSERT_OK(ResolveMany(resolver, logical_index_vec));
+
+ offsets.push_back(kMaxIndex); // adding an empty chunk
+ ChunkResolver resolver_with_empty{offsets};
+ if (sizeof(IndexType) == 1) {
+ ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec));
+ } else {
+ ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec));
+ }
+ }
+};
+
+TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes);
+
+TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); }
+TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); }
+TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); }
+
} // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc
b/cpp/src/arrow/compute/kernels/vector_sort.cc
index db2023ef04..ad22fa8d36 100644
--- a/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -747,15 +747,13 @@ class TableSorter {
auto& comparator = comparator_;
const auto& first_sort_key = sort_keys_[0];
- ChunkLocation left_loc{0, 0};
- ChunkLocation right_loc{0, 0};
+ ChunkLocation left_loc;
+ ChunkLocation right_loc;
std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end,
temp_indices,
[&](uint64_t left, uint64_t right) {
// First column is either null or nan
- left_loc =
- left_resolver_.ResolveWithChunkIndexHint(left,
/*hint=*/left_loc);
- right_loc =
- right_resolver_.ResolveWithChunkIndexHint(right,
/*hint=*/right_loc);
+ left_loc = left_resolver_.ResolveWithHint(left,
/*hint=*/left_loc);
+ right_loc = right_resolver_.ResolveWithHint(right,
/*hint=*/right_loc);
auto chunk_left = first_sort_key.GetChunk(left_loc);
auto chunk_right = first_sort_key.GetChunk(right_loc);
const auto left_is_null = chunk_left.IsNull();
@@ -786,15 +784,13 @@ class TableSorter {
// Untyped implementation
auto& comparator = comparator_;
- ChunkLocation left_loc{0, 0};
- ChunkLocation right_loc{0, 0};
+ ChunkLocation left_loc;
+ ChunkLocation right_loc;
std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end,
temp_indices,
[&](uint64_t left, uint64_t right) {
// First column is always null
- left_loc =
- left_resolver_.ResolveWithChunkIndexHint(left,
/*hint=*/left_loc);
- right_loc =
- right_resolver_.ResolveWithChunkIndexHint(right,
/*hint=*/right_loc);
+ left_loc = left_resolver_.ResolveWithHint(left,
/*hint=*/left_loc);
+ right_loc = right_resolver_.ResolveWithHint(right,
/*hint=*/right_loc);
return comparator.Compare(left_loc, right_loc, 1);
});
// Copy back temp area into main buffer
@@ -812,15 +808,13 @@ class TableSorter {
auto& comparator = comparator_;
const auto& first_sort_key = sort_keys_[0];
- ChunkLocation left_loc{0, 0};
- ChunkLocation right_loc{0, 0};
+ ChunkLocation left_loc;
+ ChunkLocation right_loc;
std::merge(range_begin, range_middle, range_middle, range_end,
temp_indices,
[&](uint64_t left, uint64_t right) {
// Both values are never null nor NaN.
- left_loc =
- left_resolver_.ResolveWithChunkIndexHint(left,
/*hint=*/left_loc);
- right_loc =
- right_resolver_.ResolveWithChunkIndexHint(right,
/*hint=*/right_loc);
+ left_loc = left_resolver_.ResolveWithHint(left,
/*hint=*/left_loc);
+ right_loc = right_resolver_.ResolveWithHint(right,
/*hint=*/right_loc);
auto chunk_left = first_sort_key.GetChunk(left_loc);
auto chunk_right = first_sort_key.GetChunk(right_loc);
DCHECK(!chunk_left.IsNull());