zeroshade commented on code in PR #34550:
URL: https://github.com/apache/arrow/pull/34550#discussion_r1135885227


##########
cpp/src/arrow/array/array_run_end.cc:
##########
@@ -85,6 +86,53 @@ void RunEndEncodedArray::SetData(const 
std::shared_ptr<ArrayData>& data) {
   values_array_ = MakeArray(this->data()->child_data[1]);
 }
 
+namespace {
+
+template <typename RunEndType>
+Result<std::shared_ptr<Array>> MakeLogicalRunEnds(const RunEndEncodedArray& 
self,
+                                                  int64_t physical_offset,
+                                                  int64_t physical_length) {
+  using RunEndCType = typename RunEndType::c_type;
+  const auto* run_ends = self.data()->child_data[0]->GetValues<RunEndCType>(1);
+  NumericBuilder<RunEndType> builder;
+  RETURN_NOT_OK(builder.Resize(physical_length));
+  if (physical_length > 0) {
+    for (int64_t i = 0; i < physical_length - 1; i++) {
+      const auto run_end = run_ends[physical_offset + i] - self.offset();
+      DCHECK_LT(run_end, self.length());
+      RETURN_NOT_OK(builder.Append(static_cast<RunEndCType>(run_end)));
+    }
+    DCHECK_GE(run_ends[physical_offset + physical_length - 1] - self.offset(),
+              self.length());
+    RETURN_NOT_OK(builder.Append(static_cast<RunEndCType>(self.length())));
+  }

Review Comment:
   If `self.offset() == 0`, you could just return a slice of the underlying 
run ends child. Would it make sense to do that?



##########
cpp/src/arrow/array/array_run_end.cc:
##########
@@ -85,6 +86,53 @@ void RunEndEncodedArray::SetData(const 
std::shared_ptr<ArrayData>& data) {
   values_array_ = MakeArray(this->data()->child_data[1]);
 }
 
+namespace {
+
+template <typename RunEndType>
+Result<std::shared_ptr<Array>> MakeLogicalRunEnds(const RunEndEncodedArray& 
self,
+                                                  int64_t physical_offset,
+                                                  int64_t physical_length) {
+  using RunEndCType = typename RunEndType::c_type;
+  const auto* run_ends = self.data()->child_data[0]->GetValues<RunEndCType>(1);
+  NumericBuilder<RunEndType> builder;
+  RETURN_NOT_OK(builder.Resize(physical_length));
+  if (physical_length > 0) {
+    for (int64_t i = 0; i < physical_length - 1; i++) {
+      const auto run_end = run_ends[physical_offset + i] - self.offset();
+      DCHECK_LT(run_end, self.length());
+      RETURN_NOT_OK(builder.Append(static_cast<RunEndCType>(run_end)));
+    }
+    DCHECK_GE(run_ends[physical_offset + physical_length - 1] - self.offset(),
+              self.length());
+    RETURN_NOT_OK(builder.Append(static_cast<RunEndCType>(self.length())));
+  }
+  return builder.Finish();
+}
+
+}  // namespace
+
+Result<std::shared_ptr<Array>> RunEndEncodedArray::LogicalRunEnds() const {
+  int64_t physical_offset = FindPhysicalOffset();
+  int64_t physical_length = FindPhysicalLength();
+  DCHECK(data()->child_data[0]->buffers[1]->is_cpu());
+
+  switch (run_ends_array_->type_id()) {
+    case Type::INT16:
+      return MakeLogicalRunEnds<Int16Type>(*this, physical_offset, 
physical_length);
+    case Type::INT32:
+      return MakeLogicalRunEnds<Int32Type>(*this, physical_offset, 
physical_length);
+    default:
+      break;
+  }
+  return MakeLogicalRunEnds<Int64Type>(*this, physical_offset, 
physical_length);

Review Comment:
   Any reason not to make this the `default` case above instead of the 
`break`? I personally find that more readable, but it's not a big deal.



##########
cpp/src/arrow/array/array_run_end.h:
##########
@@ -87,6 +87,19 @@ class ARROW_EXPORT RunEndEncodedArray : public Array {
   /// The physical offset to the array is applied.
   const std::shared_ptr<Array>& values() const { return values_array_; }
 
+  /// \brief Returns an array holding the logical indexes of each run end
+  ///
+  /// If a non-zero logical offset is set, this function allocates a new
+  /// array and rewrites all the run end values to be relative to the logical
+  /// offset and cuts the end of the array to the logical length.

Review Comment:
   Your current implementation still allocates a new array even if the offset 
is zero.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to