pitrou commented on code in PR #41561:
URL: https://github.com/apache/arrow/pull/41561#discussion_r1598490780


##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -36,8 +39,42 @@ struct ChunkLocation {
 
   /// \brief Index of the value in the chunk
   ///
-  /// The value is undefined if chunk_index >= chunks.size()
+  /// The value is UNDEFINED if chunk_index >= chunks.size()
   int64_t index_in_chunk = 0;
+
+  /// \brief Create a ChunkLocation without asserting any preconditions.
+  static ChunkLocation Forge(int64_t chunk_index, int64_t index_in_chunk) {

Review Comment:
   Can we avoid inventing terminology that's currently not used in the project?



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -97,12 +157,47 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{*this, chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n` logical indices to chunk indices.
+  ///
+  /// \pre logical_index_vec[i] < n (for valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n` elements
+  /// \post chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
+  ///
+  /// \param n The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] std::enable_if_t<std::is_unsigned_v<IndexType>, bool> 
ResolveMany(
+      int64_t n, const IndexType* ARROW_RESTRICT logical_index_vec,
+      IndexType* ARROW_RESTRICT out_chunk_index_vec, IndexType chunk_hint = 0,
+      IndexType* ARROW_RESTRICT out_index_in_chunk_vec = NULLPTR) const {

Review Comment:
   Can we avoid all the `ARROW_RESTRICT` unless they have a genuine positive 
effect? This is making the code much less readable than it could be.



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -97,12 +157,47 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{*this, chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n` logical indices to chunk indices.
+  ///
+  /// \pre logical_index_vec[i] < n (for valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n` elements
+  /// \post chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
+  ///
+  /// \param n The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] std::enable_if_t<std::is_unsigned_v<IndexType>, bool> 
ResolveMany(
+      int64_t n, const IndexType* ARROW_RESTRICT logical_index_vec,
+      IndexType* ARROW_RESTRICT out_chunk_index_vec, IndexType chunk_hint = 0,
+      IndexType* ARROW_RESTRICT out_index_in_chunk_vec = NULLPTR) const {
+    if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
+      // The max value returned by Bisect is `offsets.size() - 1` (= 
chunks.size()).
+      constexpr auto kMaxIndexTypeValue = 
std::numeric_limits<IndexType>::max();
+      const bool chunk_index_fits_on_type =
+          static_cast<uint64_t>(offsets_.size() - 1) <= kMaxIndexTypeValue;
+      if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
+        return false;
+      }

Review Comment:
   What about `out_index_in_chunk_vec`? How do you know that the resulting 
index_in_chunk values won't overflow the index type?



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -97,12 +157,47 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{*this, chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n` logical indices to chunk indices.
+  ///
+  /// \pre logical_index_vec[i] < n (for valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n` elements
+  /// \post chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
+  ///
+  /// \param n The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] std::enable_if_t<std::is_unsigned_v<IndexType>, bool> 
ResolveMany(

Review Comment:
   Given that there is no specialization going on, can you `static_assert` 
instead of `enable_it_t`? It would make the signature much more readable.



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -20,13 +20,16 @@
 #include <atomic>
 #include <cassert>
 #include <cstdint>
+#include <limits>
 #include <vector>
 
 #include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
 
 namespace arrow::internal {

Review Comment:
   Since everything here is internal, do you think we can rename this header 
file to `chunk_resolver_internal.h`? Not necessarily in this PR...



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -36,8 +39,42 @@ struct ChunkLocation {
 
   /// \brief Index of the value in the chunk
   ///
-  /// The value is undefined if chunk_index >= chunks.size()
+  /// The value is UNDEFINED if chunk_index >= chunks.size()
   int64_t index_in_chunk = 0;
+
+  /// \brief Create a ChunkLocation without asserting any preconditions.
+  static ChunkLocation Forge(int64_t chunk_index, int64_t index_in_chunk) {
+    ChunkLocation loc;
+    loc.chunk_index = chunk_index;
+    loc.index_in_chunk = index_in_chunk;
+    return loc;
+  }
+
+  ChunkLocation() = default;
+
+ public:
+  template <class ChunkResolver>
+  ChunkLocation(const ChunkResolver& resolver, int64_t chunk_index,
+                int64_t index_in_chunk)
+      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {

Review Comment:
   I don't understand why we need two different constructors while this is not 
a public API...



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -97,12 +157,47 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{*this, chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n` logical indices to chunk indices.
+  ///
+  /// \pre logical_index_vec[i] < n (for valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n` elements
+  /// \post chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
+  ///
+  /// \param n The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] std::enable_if_t<std::is_unsigned_v<IndexType>, bool> 
ResolveMany(
+      int64_t n, const IndexType* ARROW_RESTRICT logical_index_vec,

Review Comment:
   Call this `int64_t n_indices` perhaps?



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -36,8 +39,42 @@ struct ChunkLocation {
 
   /// \brief Index of the value in the chunk
   ///
-  /// The value is undefined if chunk_index >= chunks.size()
+  /// The value is UNDEFINED if chunk_index >= chunks.size()
   int64_t index_in_chunk = 0;
+
+  /// \brief Create a ChunkLocation without asserting any preconditions.
+  static ChunkLocation Forge(int64_t chunk_index, int64_t index_in_chunk) {
+    ChunkLocation loc;
+    loc.chunk_index = chunk_index;
+    loc.index_in_chunk = index_in_chunk;
+    return loc;
+  }
+
+  ChunkLocation() = default;
+
+ public:

Review Comment:
   The above are also public as this is a `struct`.



##########
cpp/src/arrow/chunk_resolver.h:
##########
@@ -97,12 +157,47 @@ struct ARROW_EXPORT ChunkResolver {
   /// \return ChunkLocation with a valid chunk_index if index is within
   ///         bounds, or with chunk_index == chunks.size() if logical index is
   ///         `>= chunked_array.length()`.
-  inline ChunkLocation ResolveWithChunkIndexHint(int64_t index,
-                                                 ChunkLocation hint) const {
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) 
const {
     assert(hint.chunk_index < static_cast<int64_t>(offsets_.size()));
     const auto chunk_index =
         ResolveChunkIndex</*StoreCachedChunk=*/false>(index, hint.chunk_index);
-    return {chunk_index, index - offsets_[chunk_index]};
+    return ChunkLocation{*this, chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n` logical indices to chunk indices.
+  ///
+  /// \pre logical_index_vec[i] < n (for valid chunk index results)
+  /// \pre out_chunk_index_vec has space for `n` elements
+  /// \post chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_index_vec[i] == chunks.size()
+  ///       and out_index_in_chunk_vec[i] is undefined (can be out-of-bounds)
+  ///
+  /// \param n The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_index_vec The output array where the chunk indices will 
be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
+  ///                               within-chunk indices will be written
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] std::enable_if_t<std::is_unsigned_v<IndexType>, bool> 
ResolveMany(

Review Comment:
   Also, I'm not sure why the indices are supposed to be unsigned here, or why 
we want to support several index types.



##########
cpp/src/arrow/chunk_resolver.cc:
##########
@@ -54,6 +54,52 @@ inline std::vector<int64_t> MakeChunksOffsets(const 
std::vector<T>& chunks) {
   offsets[chunks.size()] = offset;
   return offsets;
 }
+
+/// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+/// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+template <typename IndexType>
+void ResolveManyInline(size_t num_offsets, const int64_t* ARROW_RESTRICT 
offsets,
+                       int64_t n, const IndexType* ARROW_RESTRICT 
logical_index_vec,
+                       IndexType* ARROW_RESTRICT out_chunk_index_vec,
+                       IndexType chunk_hint,
+                       IndexType* ARROW_RESTRICT out_index_in_chunk_vec) {
+  const auto num_chunks = static_cast<IndexType>(num_offsets - 1);
+  // chunk_hint in [0, num_offsets) per the precondition.
+  for (int64_t i = 0; i < n; i++) {
+    const auto index = static_cast<uint64_t>(logical_index_vec[i]);
+    if (index >= static_cast<uint64_t>(offsets[chunk_hint]) &&
+        (chunk_hint == num_chunks ||

Review Comment:
   Hmm, when do we have `chunk_hint == num_chunks`? If this is for take/filter, 
we are merely selecting existing values, not inserting new ones at the end...



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -747,15 +747,13 @@ class TableSorter {
     auto& comparator = comparator_;
     const auto& first_sort_key = sort_keys_[0];
 
-    ChunkLocation left_loc{0, 0};
-    ChunkLocation right_loc{0, 0};
+    ChunkLocation left_loc;

Review Comment:
   I don't remember for sure, but this was probably done to avoid 
"uninitialized variable" warnings in optimized builds.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to