felipecrv commented on code in PR #39566:
URL: https://github.com/apache/arrow/pull/39566#discussion_r1467166688


##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -57,9 +60,14 @@ struct ARROW_EXPORT ChunkResolver {
     // This is trivial when merging (assuming each side of the merge uses
     // its own resolver), but also in the inner recursive invocations of
     // partitioning.
+    if (index < 0 || index >= max_index_) {
+      return {-1, -1};
+    }

Review Comment:
   Returning `offsets_.size()` as the `chunk_index` to indicate the index was 
not found is better:
   1) `upper_bound` implementation can handle it naturally without an extra 
branch
   2) the `.end()` iterator is a common way of representing that something was 
not found. Iterators, like indices represent the `[0, length)` range -- `begin 
= 0, end = length`
   3) the negative value could be cast to unsigned and create annoying overflows
   
   ```diff
   diff --git a/cpp/src/arrow/chunk_resolver.cc 
b/cpp/src/arrow/chunk_resolver.cc
   index 45084f8f5..4a1ba6d0a 100644
   --- a/cpp/src/arrow/chunk_resolver.cc
   +++ b/cpp/src/arrow/chunk_resolver.cc
   @@ -57,19 +57,13 @@ inline std::vector<int64_t> MakeChunksOffsets(const 
std::vector<T>& chunks) {
    }  // namespace
    
    ChunkResolver::ChunkResolver(const ArrayVector& chunks)
   -    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {
   -  max_index_ = offsets_.back();
   -}
   +    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
    
    ChunkResolver::ChunkResolver(const std::vector<const Array*>& chunks)
   -    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {
   -  max_index_ = offsets_.back();
   -}
   +    : offsets_(MakeChunksOffsets(chunks)), cached_chunk_(0) {}
    
    ChunkResolver::ChunkResolver(const RecordBatchVector& batches)
   -    : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {
   -  max_index_ = offsets_.back();
   -}
   +    : offsets_(MakeChunksOffsets(batches)), cached_chunk_(0) {}
    
    }  // namespace internal
    }  // namespace arrow
   diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
   index aab63094d..ff0a0b437 100644
   --- a/cpp/src/arrow/chunk_resolver.h
   +++ b/cpp/src/arrow/chunk_resolver.h
   @@ -40,19 +40,18 @@ struct ARROW_EXPORT ChunkResolver {
      explicit ChunkResolver(const RecordBatchVector& batches);
    
      ChunkResolver(ChunkResolver&& other) noexcept
   -      : offsets_(std::move(other.offsets_)),
   -        max_index_(other.max_index_),
   -        cached_chunk_(other.cached_chunk_.load()) {}
   +      : offsets_(std::move(other.offsets_)), 
cached_chunk_(other.cached_chunk_.load()) {}
    
      ChunkResolver& operator=(ChunkResolver&& other) {
        offsets_ = std::move(other.offsets_);
   -    max_index_ = other.max_index_;
        cached_chunk_.store(other.cached_chunk_.load());
        return *this;
      }
    
      /// \brief Return a ChunkLocation containing the chunk index and in-chunk 
value index of
      /// the chunked array at logical index
   +  ///
   +  /// \pre index >= 0
      inline ChunkLocation Resolve(const int64_t index) const {
        // It is common for the algorithms below to make consecutive accesses at
        // a relatively small distance from each other, hence often falling in
   @@ -60,14 +59,9 @@ struct ARROW_EXPORT ChunkResolver {
        // This is trivial when merging (assuming each side of the merge uses
        // its own resolver), but also in the inner recursive invocations of
        // partitioning.
   -    if (index < 0 || index >= max_index_) {
   -      return {-1, -1};
   -    }
   -
        if (offsets_.size() <= 1) {
          return {0, index};
        }
   -
        const auto cached_chunk = cached_chunk_.load();
        const bool cache_hit =
            (index >= offsets_[cached_chunk] && index < offsets_[cached_chunk + 
1]);
   @@ -103,9 +97,6 @@ struct ARROW_EXPORT ChunkResolver {
      // Collection of starting offsets used for binary search
      std::vector<int64_t> offsets_;
    
   -  // Latest valid index
   -  int64_t max_index_;
   -
      // Tracks the most recently used chunk index to allow fast
      // access for consecutive indices corresponding to the same chunk
      mutable std::atomic<int64_t> cached_chunk_;
   diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
   index 6a2911b90..c36b736d5 100644
   --- a/cpp/src/arrow/chunked_array.cc
   +++ b/cpp/src/arrow/chunked_array.cc
   @@ -169,7 +169,7 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& 
other,
    
    Result<std::shared_ptr<Scalar>> ChunkedArray::GetScalar(int64_t index) 
const {
      const auto loc = chunk_resolver_.Resolve(index);
   -  if (loc.chunk_index == -1) {
   +  if (loc.chunk_index >= static_cast<int64_t>(chunks_.size())) {
        return Status::IndexError("index with value of ", index,
                                  " is out-of-bounds for chunked array of 
length ", length_);
      }
   diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc 
b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
   index 3eefab610..f8e4d78fe 100644
   --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
   +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
   @@ -738,9 +738,7 @@ Result<std::shared_ptr<ChunkedArray>> TakeCA(const 
ChunkedArray& values,
    
          ChunkLocation resolved_index = index_resolver.Resolve(index);
          int64_t chunk_index = resolved_index.chunk_index;
   -      if (chunk_index < 0) {
   -        // ChunkResolver doesn't throw errors when the index is out of 
bounds
   -        // it will just return a chunk index that doesn't exist.
   +      if (chunk_index >= num_chunks) {
            return Status::IndexError("Index ", index, " is out of bounds");
          }
          indices_chunks[requested_index] = chunk_index;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to