felipecrv commented on code in PR #39566:
URL: https://github.com/apache/arrow/pull/39566#discussion_r1467166688
##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -57,9 +60,14 @@ struct ARROW_EXPORT ChunkResolver {
// This is trivial when merging (assuming each side of the merge uses
// its own resolver), but also in the inner recursive invocations of
// partitioning.
+ if (index < 0 || index >= max_index_) {
+ return {-1, -1};
+ }
Review Comment:
Returning `offsets_.size()` as the `chunk_index` to indicate the index was
not found is better:
1) `upper_bound` implementation can handle it naturally without an extra
branch
2) the `.end()` iterator is a common way of representing that something was
not found. Iterators, like indices represent the `[0, length)` range -- `begin
= 0, end = length`
3) the negative value could be cast to unsigned and create annoying overflows
```diff
diff --git a/cpp/src/arrow/chunk_resolver.cc
b/cpp/src/arrow/chunk_resolver.cc
index 45084f8f5..4a1ba6d0a 100644
--- a/cpp/src/arrow/chunk_resolver.cc
+++ b/cpp/src/arrow/chunk_resolver.cc
@@ -57,19 +57,13 @@ inline std::vector<int64_t> MakeChunksOffsets(const
std::vector<T>& chunks) {
} // namespace
ChunkResolver::ChunkResolver(const ArrayVector& chunks)
- : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {
- max_index_ = offsets_.back();
-}
+ : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
ChunkResolver::ChunkResolver(const std::vector<const Array*>& chunks)
- : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {
- max_index_ = offsets_.back();
-}
+ : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
ChunkResolver::ChunkResolver(const RecordBatchVector& batches)
- : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {
- max_index_ = offsets_.back();
-}
+ : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {}
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
index aab63094d..ff0a0b437 100644
--- a/cpp/src/arrow/chunk_resolver.h
+++ b/cpp/src/arrow/chunk_resolver.h
@@ -40,19 +40,18 @@ struct ARROW_EXPORT ChunkResolver {
explicit ChunkResolver(const RecordBatchVector& batches);
ChunkResolver(ChunkResolver&& other) noexcept
- : offsets_(std::move(other.offsets_)),
- max_index_(other.max_index_),
- cached_chunk_(other.cached_chunk_.load()) {}
+ : offsets_(std::move(other.offsets_)),
cached_chunk_(other.cached_chunk_.load()) {}
ChunkResolver& operator=(ChunkResolver&& other) {
offsets_ = std::move(other.offsets_);
- max_index_ = other.max_index_;
cached_chunk_.store(other.cached_chunk_.load());
return *this;
}
/// \brief Return a ChunkLocation containing the chunk index and in-chunk
value index of
/// the chunked array at logical index
+ ///
+ /// \pre index >= 0
inline ChunkLocation Resolve(const int64_t index) const {
// It is common for the algorithms below to make consecutive accesses at
// a relatively small distance from each other, hence often falling in
@@ -60,14 +59,9 @@ struct ARROW_EXPORT ChunkResolver {
// This is trivial when merging (assuming each side of the merge uses
// its own resolver), but also in the inner recursive invocations of
// partitioning.
- if (index < 0 || index >= max_index_) {
- return {-1, -1};
- }
-
if (offsets_.size() <= 1) {
return {0, index};
}
-
const auto cached_chunk = cached_chunk_.load();
const bool cache_hit =
(index >= offsets_[cached_chunk] && index < offsets_[cached_chunk +
1]);
@@ -103,9 +97,6 @@ struct ARROW_EXPORT ChunkResolver {
// Collection of starting offsets used for binary search
std::vector<int64_t> offsets_;
- // Latest valid index
- int64_t max_index_;
-
// Tracks the most recently used chunk index to allow fast
// access for consecutive indices corresponding to the same chunk
mutable std::atomic<int64_t> cached_chunk_;
diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
index 6a2911b90..c36b736d5 100644
--- a/cpp/src/arrow/chunked_array.cc
+++ b/cpp/src/arrow/chunked_array.cc
@@ -169,7 +169,7 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray&
other,
Result<std::shared_ptr<Scalar>> ChunkedArray::GetScalar(int64_t index)
const {
const auto loc = chunk_resolver_.Resolve(index);
- if (loc.chunk_index == -1) {
+ if (loc.chunk_index >= static_cast<int64_t>(chunks_.size())) {
return Status::IndexError("index with value of ", index,
" is out-of-bounds for chunked array of
length ", length_);
}
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index 3eefab610..f8e4d78fe 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -738,9 +738,7 @@ Result<std::shared_ptr<ChunkedArray>> TakeCA(const
ChunkedArray& values,
ChunkLocation resolved_index = index_resolver.Resolve(index);
int64_t chunk_index = resolved_index.chunk_index;
- if (chunk_index < 0) {
- // ChunkResolver doesn't throw errors when the index is out of
bounds
- // it will just return a chunk index that doesn't exist.
+ if (chunk_index >= num_chunks) {
return Status::IndexError("Index ", index, " is out of bounds");
}
indices_chunks[requested_index] = chunk_index;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]