[GitHub] [arrow] pitrou commented on a change in pull request #12055: ARROW-11989: [C++][Python] Improve ChunkedArray's complexity for the access of elements

GitBox Mon, 14 Mar 2022 03:56:25 -0700


pitrou commented on a change in pull request #12055:
URL: https://github.com/apache/arrow/pull/12055#discussion_r825813874




##########
File path: cpp/src/arrow/chunk_resolver.cc
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/chunk_resolver.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/record_batch.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+static inline std::vector<int64_t> MakeArraysOffsets(const ArrayVector& 
chunks) {
+  std::vector<int64_t> offsets(chunks.size());

Review comment:
       `chunks.size() + 1` would be better, otherwise the ending `push_back` 
may trigger a realloc.

##########
File path: cpp/src/arrow/chunk_resolver.cc
##########
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/chunk_resolver.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/record_batch.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+static inline std::vector<int64_t> MakeArraysOffsets(const ArrayVector& 
chunks) {

Review comment:
       Nit, but `static` is redundant when you're already in the anonymous 
namespace.

##########
File path: cpp/src/arrow/chunked_array.cc
##########
@@ -147,13 +148,15 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
 }
 
 Result<std::shared_ptr<Scalar>> ChunkedArray::GetScalar(int64_t index) const {
-  for (const auto& chunk : chunks_) {
-    if (index < chunk->length()) {
-      return chunk->GetScalar(index);
-    }
-    index -= chunk->length();
+  if (!chunk_resolver_) {
+    chunk_resolver_ = internal::make_unique<internal::ChunkResolver>(chunks_);
+  }
+  const auto loc = chunk_resolver_->Resolve(index);
+  if (loc.chunk_index >= static_cast<int64_t>(chunks_.size())) {
+    return Status::IndexError("tried to refer to chunk ", loc.chunk_index,
+                              " but ChunkedArray is only ", chunks_.size(), " 
long");

Review comment:
       That's a weird error message. The cause is probably that the index 
passed to `GetScalar` is out of bounds, why not simply say that?

##########
File path: cpp/src/arrow/chunked_array.cc
##########
@@ -147,13 +148,15 @@ bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
 }
 
 Result<std::shared_ptr<Scalar>> ChunkedArray::GetScalar(int64_t index) const {
-  for (const auto& chunk : chunks_) {
-    if (index < chunk->length()) {
-      return chunk->GetScalar(index);
-    }
-    index -= chunk->length();
+  if (!chunk_resolver_) {
+    chunk_resolver_ = internal::make_unique<internal::ChunkResolver>(chunks_);

Review comment:
       Here as well, there's an atomicity problem. Unfortunately, there are no 
facilities for atomic access to `unique_ptr`. You'd have to make this a 
`shared_ptr`; another possibility is to always initialise the `ChunkResolver` 
in the constructor.

##########
File path: cpp/src/arrow/compute/kernels/chunked_internal.h
##########
@@ -61,97 +62,21 @@ struct ResolvedChunk<Array> {
   bool IsNull() const { return array->IsNull(index); }
 };
 
-struct ChunkLocation {
-  int64_t chunk_index, index_in_chunk;
-};
-
-// An object that resolves an array chunk depending on the index.
-struct ChunkResolver {
-  explicit ChunkResolver(std::vector<int64_t> lengths)
-      : num_chunks_(static_cast<int64_t>(lengths.size())),
-        offsets_(MakeEndOffsets(std::move(lengths))),
-        cached_chunk_(0) {}
-
-  ChunkLocation Resolve(int64_t index) const {
-    // It is common for the algorithms below to make consecutive accesses at
-    // a relatively small distance from each other, hence often falling in
-    // the same chunk.
-    // This is trivial when merging (assuming each side of the merge uses
-    // its own resolver), but also in the inner recursive invocations of
-    // partitioning.
-    const bool cache_hit =
-        (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 
1]);
-    if (ARROW_PREDICT_TRUE(cache_hit)) {
-      return {cached_chunk_, index - offsets_[cached_chunk_]};
-    } else {
-      return ResolveMissBisect(index);
-    }
-  }
-
-  static ChunkResolver FromBatches(const RecordBatchVector& batches) {
-    std::vector<int64_t> lengths(batches.size());
-    std::transform(
-        batches.begin(), batches.end(), lengths.begin(),
-        [](const std::shared_ptr<RecordBatch>& batch) { return 
batch->num_rows(); });
-    return ChunkResolver(std::move(lengths));
-  }
-
- protected:
-  ChunkLocation ResolveMissBisect(int64_t index) const {
-    // Like std::upper_bound(), but hand-written as it can help the compiler.
-    const int64_t* raw_offsets = offsets_.data();
-    // Search [lo, lo + n)
-    int64_t lo = 0, n = num_chunks_;
-    while (n > 1) {
-      int64_t m = n >> 1;
-      int64_t mid = lo + m;
-      if (index >= raw_offsets[mid]) {
-        lo = mid;
-        n -= m;
-      } else {
-        n = m;
-      }
-    }
-    cached_chunk_ = lo;
-    return {lo, index - offsets_[lo]};
-  }
-
-  static std::vector<int64_t> MakeEndOffsets(std::vector<int64_t> lengths) {
-    int64_t offset = 0;
-    for (auto& v : lengths) {
-      const auto this_length = v;
-      v = offset;
-      offset += this_length;
-    }
-    lengths.push_back(offset);
-    return lengths;
-  }
-
-  int64_t num_chunks_;
-  std::vector<int64_t> offsets_;
-
-  mutable int64_t cached_chunk_;
-};
-
-struct ChunkedArrayResolver : protected ChunkResolver {
+struct ChunkedArrayResolver : protected ::arrow::internal::ChunkResolver {
   explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
-      : ChunkResolver(MakeLengths(chunks)), chunks_(chunks) {}
+      : ::arrow::internal::ChunkResolver(chunks), chunks_(chunks) {}
 
   template <typename ArrayType>
   ResolvedChunk<ArrayType> Resolve(int64_t index) const {
-    const auto loc = ChunkResolver::Resolve(index);
-    return ResolvedChunk<ArrayType>(
-        checked_cast<const ArrayType*>(chunks_[loc.chunk_index]), 
loc.index_in_chunk);
+    const auto loc = ::arrow::internal::ChunkResolver::Resolve(index);
+    // if (loc.chunk_index >= static_cast<int64_t>(chunks_.size())) {
+    //   return Status::IndexError("tried to refer to chunk ", loc.chunk_index,
+    //                             " but ChunkedArray is only ", 
chunks_.size(), " long");
+    // }
+    return {checked_cast<const ArrayType*>(chunks_[loc.chunk_index]), 
loc.index_in_chunk};

Review comment:
       That's not a problem, this is an internal facility which assumes the 
caller is passing valid indices.

##########
File path: cpp/src/arrow/chunk_resolver.h
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+struct ChunkLocation {
+  int64_t chunk_index, index_in_chunk;
+};
+
+/// \class ChunkResolver
+/// \brief An object that resolves an array chunk depending on the index
+struct ChunkResolver {
+  /// \brief Construct a ChunkResolver from a vector of Arrays
+  explicit ChunkResolver(const ArrayVector& chunks);
+
+  /// \brief Construct a ChunkResolver from a vector of C-style Array pointers
+  explicit ChunkResolver(const std::vector<const Array*>& chunks);
+
+  /// \brief Construct a ChunkResolver from a vector of RecordBatches
+  explicit ChunkResolver(const RecordBatchVector& batches);
+
+  /// \brief Return a ChunkLocation containing the chunk index and in-chunk 
value index of
+  /// the chunked array at logical index
+  inline ChunkLocation Resolve(const int64_t index) const {
+    // It is common for the algorithms below to make consecutive accesses at
+    // a relatively small distance from each other, hence often falling in
+    // the same chunk.
+    // This is trivial when merging (assuming each side of the merge uses
+    // its own resolver), but also in the inner recursive invocations of
+    // partitioning.
+    if (offsets_.size() <= 1) {
+      return {0, index};
+    }
+    const bool cache_hit =
+        (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 
1]);
+    if (ARROW_PREDICT_TRUE(cache_hit)) {
+      return {cached_chunk_, index - offsets_[cached_chunk_]};
+    }
+    auto chunk_index = Bisect(index);
+    cached_chunk_ = chunk_index;
+    return {cached_chunk_, index - offsets_[cached_chunk_]};
+  }
+
+ protected:
+  /// \brief Find the chunk index corresponding to a value index using binary 
search
+  /// (bisect) algorithm
+  inline int64_t Bisect(const int64_t index) const {
+    // Like std::upper_bound(), but hand-written as it can help the compiler.
+    // Search [lo, lo + n)
+    int64_t lo = 0;
+    auto n = static_cast<int64_t>(offsets_.size());
+    while (n > 1) {
+      const int64_t m = n >> 1;
+      const int64_t mid = lo + m;
+      if (static_cast<int64_t>(index) >= offsets_[mid]) {
+        lo = mid;
+        n -= m;
+      } else {
+        n = m;
+      }
+    }
+    return lo;
+  }
+
+ private:
+  const std::vector<int64_t> offsets_;
+  mutable int64_t cached_chunk_ = 0;

Review comment:
       You'll have to make this a `std::atomic` since the cache could be 
accessed from several threads.

##########
File path: python/pyarrow/table.pxi
##########
@@ -209,19 +209,14 @@ cdef class ChunkedArray(_PandasConvertible):
         -------
         value : Scalar (index) or ChunkedArray (slice)
         """
-        if isinstance(key, slice):
+
+        if PySlice_Check(key):

Review comment:
       Is this supposed to micro-optimize the instance check? Do you get a 
significant speedup with this?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #12055: ARROW-11989: [C++][Python] Improve ChunkedArray's complexity for the access of elements

Reply via email to