This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 422174e9b6 GH-35675 [C++] Don't copy the ArraySpan into the REE 
ArraySpan (#35677)
422174e9b6 is described below

commit 422174e9b6c926fdb6f55741f1484e9b3e0cb001
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Tue May 30 12:27:52 2023 -0300

    GH-35675 [C++] Don't copy the ArraySpan into the REE ArraySpan (#35677)
    
    ### Rationale for this change
    
    Copying ArraySpan is not without issues (see #35581), and this change seems 
good by itself as it makes it easier for the compiler to SROA [1] the whole 
`RunEndEncodedArraySpan` class.
    
    [1] 
https://www.llvm.org/docs/Passes.html#sroa-scalar-replacement-of-aggregates
    
    ### What changes are included in this PR?
    
     - Make `array_span` private in `ree_util::RunEndEncodedArraySpan`
     - Allow construction of  `ree_util::RunEndEncodedArraySpan` with separate 
`offset` and `length`
     - Change `RunEndEncodedBuilder` to avoid a copy of the `ArraySpan` being 
iterated over
     - Prevent instantiation based on implicit conversion from `ArrayData` to 
`ArraySpan`
    
    ### Are these changes tested?
    
    Yes, by the existing unit tests of the various uses of 
`RunEndEncodedArraySpan`.
    
    ### Are there any user-facing changes?
    
    No meaningful changes other than some small tweaks in the interface of a 
recently added class.
    
    * Closes: #35675
    
    Authored-by: Felipe Oliveira Carvalho <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/array/builder_run_end.cc | 23 ++++++++---------
 cpp/src/arrow/array/builder_run_end.h  |  2 +-
 cpp/src/arrow/util/ree_util.h          | 47 ++++++++++++++++++++++++----------
 cpp/src/arrow/util/ree_util_test.cc    |  6 +++--
 4 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/cpp/src/arrow/array/builder_run_end.cc 
b/cpp/src/arrow/array/builder_run_end.cc
index ec285b8e3a..cff8d72952 100644
--- a/cpp/src/arrow/array/builder_run_end.cc
+++ b/cpp/src/arrow/array/builder_run_end.cc
@@ -220,18 +220,21 @@ Status RunEndEncodedBuilder::AppendScalars(const 
ScalarVector& scalars) {
 }
 
 template <typename RunEndCType>
-Status RunEndEncodedBuilder::DoAppendArray(const ArraySpan& to_append) {
-  DCHECK_GT(to_append.length, 0);
+Status RunEndEncodedBuilder::DoAppendArraySlice(const ArraySpan& array, 
int64_t offset,
+                                                int64_t length) {
+  ARROW_DCHECK(offset + length <= array.length);
+  DCHECK_GT(length, 0);
   DCHECK(!value_run_builder_->has_open_run());
 
-  ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(to_append);
+  ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(array, array.offset + 
offset,
+                                                         length);
   const int64_t physical_offset = ree_span.PhysicalIndex(0);
   const int64_t physical_length =
       ree_span.PhysicalIndex(ree_span.length() - 1) + 1 - physical_offset;
 
   RETURN_NOT_OK(ReservePhysical(physical_length));
 
-  // Append all the run ends from to_append
+  // Append all the run ends from array sliced by offset and length
   for (auto it = ree_span.iterator(0, physical_offset); !it.is_end(ree_span); 
++it) {
     const int64_t run_end = committed_logical_length_ + it.run_length();
     RETURN_NOT_OK(DoAppendRunEnd<RunEndCType>(run_end));
@@ -240,14 +243,13 @@ Status RunEndEncodedBuilder::DoAppendArray(const 
ArraySpan& to_append) {
 
   // Append all the values directly
   RETURN_NOT_OK(value_run_builder_->AppendRunCompressedArraySlice(
-      ree_util::ValuesArray(to_append), physical_offset, physical_length));
+      ree_util::ValuesArray(array), physical_offset, physical_length));
 
   return Status::OK();
 }
 
 Status RunEndEncodedBuilder::AppendArraySlice(const ArraySpan& array, int64_t 
offset,
                                               int64_t length) {
-  ARROW_DCHECK(offset + length <= array.length);
   ARROW_DCHECK(array.type->Equals(type_));
 
   // Ensure any open run is closed before appending the array slice.
@@ -257,18 +259,15 @@ Status RunEndEncodedBuilder::AppendArraySlice(const 
ArraySpan& array, int64_t of
     return Status::OK();
   }
 
-  ArraySpan to_append = array;
-  to_append.SetSlice(array.offset + offset, length);
-
   switch (type_->run_end_type()->id()) {
     case Type::INT16:
-      RETURN_NOT_OK(DoAppendArray<int16_t>(to_append));
+      RETURN_NOT_OK(DoAppendArraySlice<int16_t>(array, offset, length));
       break;
     case Type::INT32:
-      RETURN_NOT_OK(DoAppendArray<int32_t>(to_append));
+      RETURN_NOT_OK(DoAppendArraySlice<int32_t>(array, offset, length));
       break;
     case Type::INT64:
-      RETURN_NOT_OK(DoAppendArray<int64_t>(to_append));
+      RETURN_NOT_OK(DoAppendArraySlice<int64_t>(array, offset, length));
       break;
     default:
       return Status::Invalid("Invalid type for run ends array: ", 
type_->run_end_type());
diff --git a/cpp/src/arrow/array/builder_run_end.h 
b/cpp/src/arrow/array/builder_run_end.h
index 9764c57992..ac92efbd0d 100644
--- a/cpp/src/arrow/array/builder_run_end.h
+++ b/cpp/src/arrow/array/builder_run_end.h
@@ -273,7 +273,7 @@ class ARROW_EXPORT RunEndEncodedBuilder : public 
ArrayBuilder {
 
   // Pre-condition: !value_run_builder_.has_open_run()
   template <typename RunEndCType>
-  Status DoAppendArray(const ArraySpan& to_append);
+  Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t 
length);
 
   template <typename RunEndCType>
   Status DoAppendRunEnd(int64_t run_end);
diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h
index 8c8f58a151..e708eb0b59 100644
--- a/cpp/src/arrow/util/ree_util.h
+++ b/cpp/src/arrow/util/ree_util.h
@@ -227,20 +227,39 @@ class RunEndEncodedArraySpan {
     int64_t physical_pos_;
   };
 
-  explicit RunEndEncodedArraySpan(const ArrayData& data)
-      : RunEndEncodedArraySpan(ArraySpan{data}) {}
+  // Prevent implicit ArrayData -> ArraySpan conversion in
+  // RunEndEncodedArraySpan instantiation.
+  explicit RunEndEncodedArraySpan(const ArrayData& data) = delete;
 
-  explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
-      : array_span{array_span}, run_ends_(RunEnds<RunEndCType>(array_span)) {
-    assert(array_span.type->id() == Type::RUN_END_ENCODED);
+  /// \brief Construct a RunEndEncodedArraySpan from an ArraySpan and new
+  /// absolute offset and length.
+  ///
+  /// RunEndEncodedArraySpan{span, off, len} is equivalent to:
+  ///
+  ///   span.SetSlice(off, len);
+  ///   RunEndEncodedArraySpan{span}
+  ///
+  /// ArraySpan::SetSlice() updates the null_count to kUnknownNullCount, but
+  /// we don't need that here as REE arrays have null_count set to 0 by
+  /// convention.
+  explicit RunEndEncodedArraySpan(const ArraySpan& array_span, int64_t offset,
+                                  int64_t length)
+      : array_span_{array_span},
+        run_ends_(RunEnds<RunEndCType>(array_span_)),
+        length_(length),
+        offset_(offset) {
+    assert(array_span_.type->id() == Type::RUN_END_ENCODED);
   }
 
-  int64_t length() const { return array_span.length; }
-  int64_t offset() const { return array_span.offset; }
+  explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
+      : RunEndEncodedArraySpan(array_span, array_span.offset, 
array_span.length) {}
+
+  int64_t offset() const { return offset_; }
+  int64_t length() const { return length_; }
 
   int64_t PhysicalIndex(int64_t logical_pos) const {
-    return internal::FindPhysicalIndex(run_ends_, 
RunEndsArray(array_span).length,
-                                       logical_pos, offset());
+    return internal::FindPhysicalIndex(run_ends_, 
RunEndsArray(array_span_).length,
+                                       logical_pos, offset_);
   }
 
   /// \brief Create an iterator from a logical position and its
@@ -296,9 +315,9 @@ class RunEndEncodedArraySpan {
                     (length() == 0) ? PhysicalIndex(0) : 
PhysicalIndex(length() - 1) + 1);
   }
 
-  // Pre-condition: physical_pos < RunEndsArray(array_span).length);
+  // Pre-condition: physical_pos < RunEndsArray(array_span_).length);
   inline int64_t run_end(int64_t physical_pos) const {
-    assert(physical_pos < RunEndsArray(array_span).length);
+    assert(physical_pos < RunEndsArray(array_span_).length);
     // Logical index of the end of the run at physical_pos with offset applied
     const int64_t logical_run_end =
         std::max<int64_t>(static_cast<int64_t>(run_ends_[physical_pos]) - 
offset(), 0);
@@ -306,11 +325,11 @@ class RunEndEncodedArraySpan {
     return std::min(logical_run_end, length());
   }
 
- public:
-  const ArraySpan array_span;
-
  private:
+  const ArraySpan& array_span_;
   const RunEndCType* run_ends_;
+  const int64_t length_;
+  const int64_t offset_;
 };
 
 /// \brief Iterate over two run-end encoded arrays in runs or sub-runs that are
diff --git a/cpp/src/arrow/util/ree_util_test.cc 
b/cpp/src/arrow/util/ree_util_test.cc
index 5f630c3028..966cbd8f38 100644
--- a/cpp/src/arrow/util/ree_util_test.cc
+++ b/cpp/src/arrow/util/ree_util_test.cc
@@ -167,8 +167,10 @@ TYPED_TEST_P(ReeUtilTest, MergedRunsInterator) {
                        RunEndEncodedArray::Make(2050, right_run_ends, 
right_child));
   left_array = left_array->Slice(left_parent_offset);
   right_array = right_array->Slice(right_parent_offset);
-  const RunEndEncodedArraySpan<TypeParam> left_ree_span(*left_array->data());
-  const RunEndEncodedArraySpan<TypeParam> right_ree_span(*right_array->data());
+  ArraySpan left_array_span(*left_array->data());
+  ArraySpan right_array_span(*right_array->data());
+  const RunEndEncodedArraySpan<TypeParam> left_ree_span(left_array_span);
+  const RunEndEncodedArraySpan<TypeParam> right_ree_span(right_array_span);
 
   {
     ARROW_SCOPED_TRACE("iterate over merged(left, right)");

Reply via email to