pitrou commented on code in PR #33641:
URL: https://github.com/apache/arrow/pull/33641#discussion_r1073318297


##########
cpp/src/arrow/CMakeLists.txt:
##########
@@ -140,13 +140,16 @@ set(ARROW_SRCS
     array/array_binary.cc
     array/array_decimal.cc
     array/array_dict.cc
+    array/array_encoded.h
+    array/array_encoded.cc

Review Comment:
   1) `.h` files needn't appear here.
   2) can we name this something else? "encoded" is much too vague. For example 
`array_run_end.cc`
   



##########
cpp/src/arrow/type_test.cc:
##########
@@ -1927,6 +1927,38 @@ TEST(TypesTest, TestDecimalEquals) {
   AssertTypeNotEqual(t5, t10);
 }
 
+TEST(TypesTest, TestRunEndEncodedType) {
+  auto int8_make_shared = std::make_shared<RunEndEncodedType>(int32(), 
list(int8()));
+  auto int8_factory = run_end_encoded(int32(), list(int8()));
+  auto int32_factory = run_end_encoded(int32(), list(int32()));

Review Comment:
   Why "factory"?



##########
cpp/src/arrow/type_traits.h:
##########
@@ -1177,6 +1197,7 @@ constexpr bool is_nested(Type::type type_id) {
     case Type::STRUCT:
     case Type::SPARSE_UNION:
     case Type::DENSE_UNION:
+    case Type::RUN_END_ENCODED:

Review Comment:
   I'm not sure this is right? These types are semantically (logically) nested. 
Run-end encoded arrays are only physically nested. I don't know which choice 
makes the most sense and/or requires the least special-casing.
   
   @lidavidm @bkietz  What do you think?



##########
cpp/src/arrow/type.h:
##########
@@ -1196,6 +1196,49 @@ class ARROW_EXPORT DenseUnionType : public UnionType {
   std::string name() const override { return "dense_union"; }
 };
 
+/// \brief Base class for types using a special-encoding (e.g. 
run-end-encoding)
+///
+/// This allows users to check if arrays are logically compatible even when the
+/// physical encoding of the types differs.
+class ARROW_EXPORT EncodedType : public NestedType {
+ public:
+  EncodedType(Type::type id, std::shared_ptr<DataType> encoded_type)
+      : NestedType(id), encoded_type_{std::move(encoded_type)} {}
+
+  const std::shared_ptr<DataType>& encoded_type() const { return 
encoded_type_; }

Review Comment:
   Note that `DictionaryType` uses `value_type` not `encoded_type`. It would 
also match the child name which is "values".



##########
cpp/src/arrow/pretty_print.cc:
##########
@@ -377,6 +378,22 @@ class ArrayPrinter : public PrettyPrinter {
     return PrettyPrint(*array.indices(), ChildOptions(true), sink_);
   }
 
+  Status Visit(const RunEndEncodedArray& array) {
+    Newline();
+    Indent();
+    Write("-- run ends array (offset: ");
+    Write(std::to_string(array.offset()));
+    Write(", logical length: ");
+    Write(std::to_string(array.length()));
+    Write(")\n");

Review Comment:
   Why write out the offset and length? We don't do this for dictionary 
indices, for example.



##########
cpp/src/arrow/type.h:
##########
@@ -1196,6 +1196,49 @@ class ARROW_EXPORT DenseUnionType : public UnionType {
   std::string name() const override { return "dense_union"; }
 };
 
+/// \brief Base class for types using a special-encoding (e.g. 
run-end-encoding)

Review Comment:
   Uh, I'm not sure that's useful, but it sounds confusing (especially as this 
doesn't seem to cover dictionary types, which are also "encoded").



##########
cpp/src/arrow/compute/kernel.h:
##########
@@ -141,6 +141,14 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> 
FixedSizeBinaryLike();
 // Type)
 ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
 
+/// \brief Match run-end encoded types that encode specific plain data types
+///
+/// @param[in] run_ends_type_matcher a matcher that is applied to the run_ends 
field
+/// @param[in] encoded_type_matcher a matcher that is applied to the values 
field
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
+    std::shared_ptr<TypeMatcher> run_ends_type_matcher,

Review Comment:
   Is it generally useful to pass a run ends matcher? The implementation will 
probably be the same, just templated by run ends type.



##########
cpp/src/arrow/scalar.cc:
##########
@@ -1149,7 +1149,7 @@ Result<std::shared_ptr<Scalar>> 
Scalar::CastTo(std::shared_ptr<DataType> to) con
   if (is_valid) {
     out->is_valid = true;
     ToTypeVisitor unpack_to_type{*this, to, out.get()};
-    RETURN_NOT_OK(VisitTypeInline(*to, &unpack_to_type));
+    RETURN_NOT_OK(VisitScalarTypeInline(*to, &unpack_to_type));

Review Comment:
   Hmm, we should definitely define a `RunEndEncodedScalar` instead of hacking 
our way around its lack, IMHO.



##########
cpp/src/arrow/pretty_print.cc:
##########
@@ -41,6 +41,7 @@
 #include "arrow/util/formatting.h"
 #include "arrow/util/int_util_overflow.h"
 #include "arrow/util/key_value_metadata.h"
+#include "arrow/util/ree_util.h"

Review Comment:
   Doesn't seem required?



##########
cpp/src/arrow/array.h:
##########
@@ -34,10 +34,15 @@
 /// @{
 /// @}
 
+/// \defgroup encoded-arrays Concrete classes for encoded arrays

Review Comment:
   "run-end-encoded arrays"



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -436,6 +438,23 @@ class ConcatenateImpl {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& type) {

Review Comment:
   These changes seem to lack tests, don't they?



##########
cpp/src/arrow/type.h:
##########
@@ -2103,6 +2146,7 @@ static inline bool HasValidityBitmap(Type::type id) {
     case Type::NA:
     case Type::DENSE_UNION:
     case Type::SPARSE_UNION:
+    case Type::RUN_END_ENCODED:

Review Comment:
   Unrelated, but could make this function `constexpr` I think.



##########
cpp/src/arrow/compare.cc:
##########
@@ -700,6 +734,15 @@ class TypeEqualsVisitor {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& left) {
+    const auto& right = checked_cast<const RunEndEncodedType&>(right_);
+    // NOTE(felipecrv): the type of the run_ends might differ, but that doesn't
+    // change the logical meaning of the type so only the encoded type is
+    // compared.

Review Comment:
   That's inconsistent with how dictionary equality works (see above), and 
would also break `CompareRunEndEncoded` above.



##########
cpp/src/arrow/testing/json_internal.cc:
##########
@@ -437,6 +437,10 @@ class SchemaWriter {
 
   Status Visit(const DictionaryType& type) { return 
VisitType(*type.value_type()); }
 
+  Status Visit(const RunEndEncodedType& type) {
+    return Status::NotImplemented("run-end encoded type in JSON");

Review Comment:
   Why not use `Status::NotImplemented(type.name())` as below?



##########
cpp/src/arrow/compare.cc:
##########
@@ -459,6 +473,26 @@ class RangeDataEqualsImpl {
     return Status::OK();
   }
 
+  template <typename RunEndsType>
+  Status CompareRunEndEncoded() {
+    auto left_span = ArraySpan(left_);
+    auto right_span = ArraySpan(right_);
+    left_span.SetSlice(left_.offset + left_start_idx_, range_length_);
+    right_span.SetSlice(right_.offset + right_start_idx_, range_length_);
+    for (auto it = ree_util::MergedRunsIterator<RunEndsType, 
RunEndsType>(left_span,
+                                                                          
right_span);

Review Comment:
   That only works if both run-ends types are identical, which is inconsistent 
with type equality definition below.



##########
cpp/src/arrow/array/builder_encoded.h:
##########
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+
+namespace arrow {
+
+/// \addtogroup encoded-builders
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded builder
+
+class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
+ public:
+  RunEndEncodedBuilder(MemoryPool* pool,
+                       const std::shared_ptr<ArrayBuilder>& run_end_builder,
+                       const std::shared_ptr<ArrayBuilder>& value_builder,
+                       std::shared_ptr<DataType> type);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) final;
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Resize(int64_t capacity) final;
+  void Reset() final;
+
+  Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return 
FinishTyped(out); }
+  Status AppendNull() final;
+  Status AppendNulls(int64_t length) final;
+  Status AppendEmptyValue() final;
+  Status AppendEmptyValues(int64_t length) final;
+  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) final;
+  Status AppendScalars(const ScalarVector& scalars) final;
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t 
length) final;
+  std::shared_ptr<DataType> type() const final;
+
+  /// \brief Allocate enough memory for a given number of runs. Like Resize on 
non-REE
+  /// builders, it does not account for variable size data.

Review Comment:
   This is not clear. Why "physical"? What does the capacity mean (number of 
offsets? number of values?)?
   
   Also, generally a Reserve facility is more useful than a Resize one.



##########
cpp/src/arrow/array/builder_encoded.cc:
##########
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_encoded.h"
+#include "arrow/array/builder_primitive.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "arrow/scalar.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// RunEndEncodedBuilder
+
+RunEndEncodedBuilder::RunEndEncodedBuilder(
+    MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& run_end_builder,
+    const std::shared_ptr<ArrayBuilder>& value_builder, 
std::shared_ptr<DataType> type)
+    : ArrayBuilder(pool), 
type_(internal::checked_pointer_cast<RunEndEncodedType>(type)) {
+  children_ = {std::move(run_end_builder), std::move(value_builder)};
+}
+
+Status RunEndEncodedBuilder::Resize(int64_t) {
+  return Status::NotImplemented(

Review Comment:
   This seems rather hostile. If we don't know how to resize, let's please not 
expose a method for it.



##########
cpp/src/arrow/visitor_generate.h:
##########
@@ -35,34 +35,39 @@ namespace arrow {
   ACTION(Float);                                     \
   ACTION(Double)
 
-#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION)    \
-  ACTION(Null);                                 \
-  ACTION(Boolean);                              \
-  ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
-  ACTION(String);                               \
-  ACTION(Binary);                               \
-  ACTION(LargeString);                          \
-  ACTION(LargeBinary);                          \
-  ACTION(FixedSizeBinary);                      \
-  ACTION(Duration);                             \
-  ACTION(Date32);                               \
-  ACTION(Date64);                               \
-  ACTION(Timestamp);                            \
-  ACTION(Time32);                               \
-  ACTION(Time64);                               \
-  ACTION(MonthDayNanoInterval);                 \
-  ACTION(MonthInterval);                        \
-  ACTION(DayTimeInterval);                      \
-  ACTION(Decimal128);                           \
-  ACTION(Decimal256);                           \
-  ACTION(List);                                 \
-  ACTION(LargeList);                            \
-  ACTION(Map);                                  \
-  ACTION(FixedSizeList);                        \
-  ACTION(Struct);                               \
-  ACTION(SparseUnion);                          \
-  ACTION(DenseUnion);                           \
-  ACTION(Dictionary);                           \
+// all types that can exist as a Scalar

Review Comment:
   As I said in another comment, we should add `RunEndEncodedScalar` instead of 
hacking around it, IMHO.



##########
cpp/src/arrow/array/data.cc:
##########
@@ -195,6 +195,7 @@ int GetNumBuffers(const DataType& type) {
     case Type::NA:
     case Type::STRUCT:
     case Type::FIXED_SIZE_LIST:
+    case Type::RUN_END_ENCODED:
       return 1;

Review Comment:
   Why 1? Run-end encoded arrays have no top-level buffers (no null bitmap), do 
they? Or do we reserve an empty null bitmap slot?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -436,6 +438,23 @@ class ConcatenateImpl {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& type) {
+    int64_t physical_length = 0;
+    for (auto input : in_) {

Review Comment:
   Needn't copy the `shared_ptr`
   ```suggestion
       for (const auto& input : in_) {
   ```



##########
cpp/src/arrow/array/builder_encoded.cc:
##########
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/builder_encoded.h"
+#include "arrow/array/builder_primitive.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "arrow/scalar.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// RunEndEncodedBuilder
+
+RunEndEncodedBuilder::RunEndEncodedBuilder(
+    MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& run_end_builder,
+    const std::shared_ptr<ArrayBuilder>& value_builder, 
std::shared_ptr<DataType> type)
+    : ArrayBuilder(pool), 
type_(internal::checked_pointer_cast<RunEndEncodedType>(type)) {
+  children_ = {std::move(run_end_builder), std::move(value_builder)};
+}
+
+Status RunEndEncodedBuilder::Resize(int64_t) {
+  return Status::NotImplemented(
+      "Resizing an REE for a given number of logical elements is not possible, 
since the "
+      "physical length will vary depending on the contents. To allocate memory 
for a "
+      "certain number of runs, use ResizePhysical.");
+}
+
+void RunEndEncodedBuilder::Reset() {
+  capacity_ = length_ = null_count_ = 0;
+  value_builder().Reset();
+  run_end_builder().Reset();
+  current_value_.reset();
+  run_start_ = 0;
+}
+
+Status RunEndEncodedBuilder::AppendNull() { return AppendNulls(1); }
+
+Status RunEndEncodedBuilder::AppendNulls(int64_t length) {
+  if (length == 0) {
+    return Status::OK();
+  }
+  if (current_value_) {
+    RETURN_NOT_OK(FinishRun());
+    current_value_ = {};
+  }
+  return AddLength(length);
+}
+
+Status RunEndEncodedBuilder::AppendEmptyValue() { return AppendNull(); }
+
+Status RunEndEncodedBuilder::AppendEmptyValues(int64_t length) {
+  return AppendNulls(length);
+}
+
+Status RunEndEncodedBuilder::AppendScalar(const Scalar& scalar, int64_t 
n_repeats) {
+  if (n_repeats == 0) {
+    return Status::OK();
+  }
+  if (!scalar.is_valid) {
+    return AppendNulls(n_repeats);
+  }
+  if (!current_value_ || !current_value_->Equals(scalar)) {
+    RETURN_NOT_OK(FinishRun());
+    current_value_ = scalar.shared_from_this();
+  }
+  return AddLength(n_repeats);
+}
+
+Status RunEndEncodedBuilder::AppendScalars(const ScalarVector& scalars) {
+  for (auto scalar : scalars) {
+    RETURN_NOT_OK(AppendScalar(*scalar, 1));
+  }
+  return Status::OK();
+}
+
+template <typename RunEndsType>
+Status RunEndEncodedBuilder::DoAppendArray(const ArraySpan& to_append) {
+  const int64_t physical_offset = ree_util::FindPhysicalOffset(to_append);
+  int64_t physical_length = 0;
+  for (auto it = ree_util::MergedRunsIterator<RunEndsType>(to_append);
+       it != ree_util::MergedRunsIterator(); ++it) {
+    physical_length++;
+    length_ += it.run_length();
+    RETURN_NOT_OK(DoAppendRunEnd<RunEndsType>());
+  }
+  return value_builder().AppendArraySlice(ree_util::ValuesArray(to_append),
+                                          physical_offset, physical_length);
+}
+
+Status RunEndEncodedBuilder::AppendArraySlice(const ArraySpan& array, int64_t 
offset,
+                                              int64_t length) {
+  // Finish eventual runs started using AppendScalars() and others before. We 
don't
+  // attempt to merge them with the runs from the appended array slice for now
+  RETURN_NOT_OK(FinishRun());
+
+  ARROW_DCHECK(offset + length <= array.length);
+  ARROW_DCHECK(array.type->Equals(type_));
+  // Create a slice of the slice for the part we actually want to add
+  ArraySpan to_append = array;
+  to_append.SetSlice(array.offset + offset, length);
+
+  switch (type_->run_ends_type()->id()) {
+    case Type::INT16:
+      RETURN_NOT_OK(DoAppendArray<int16_t>(to_append));
+      break;
+    case Type::INT32:
+      RETURN_NOT_OK(DoAppendArray<int32_t>(to_append));
+      break;
+    case Type::INT64:
+      RETURN_NOT_OK(DoAppendArray<int64_t>(to_append));
+      break;
+    default:
+      return Status::Invalid("Invalid type for run ends array: ", 
type_->run_ends_type());
+  }
+
+  // next run is not merged either
+  run_start_ = length_;
+  return Status::OK();
+}
+
+std::shared_ptr<DataType> RunEndEncodedBuilder::type() const { return type_; }
+
+Status RunEndEncodedBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto run_ends_array, run_end_builder().Finish());
+  ARROW_ASSIGN_OR_RAISE(auto values_array, value_builder().Finish());
+  ARROW_ASSIGN_OR_RAISE(auto ree_array,
+                        RunEndEncodedArray::Make(run_ends_array, values_array, 
length_));
+  *out = std::move(ree_array->data());
+  return Status::OK();
+}
+
+template <typename RunEndsType>
+Status RunEndEncodedBuilder::DoAppendRunEnd() {
+  return internal::checked_cast<typename 
CTypeTraits<RunEndsType>::BuilderType*>(
+             children_[0].get())
+      ->Append(static_cast<RunEndsType>(length_));
+}
+
+Status RunEndEncodedBuilder::FinishRun() {
+  if (length_ - run_start_ == 0) {
+    return Status::OK();
+  }
+  if (current_value_) {
+    RETURN_NOT_OK(value_builder().AppendScalar(*current_value_));
+  } else {
+    RETURN_NOT_OK(value_builder().AppendNull());
+  }
+  switch (type_->run_ends_type()->id()) {
+    case Type::INT16:
+      RETURN_NOT_OK(DoAppendRunEnd<int16_t>());
+      break;
+    case Type::INT32:
+      RETURN_NOT_OK(DoAppendRunEnd<int32_t>());
+      break;
+    case Type::INT64:
+      RETURN_NOT_OK(DoAppendRunEnd<int64_t>());
+      break;
+    default:
+      return Status::Invalid("Invalid type for run ends array: ", 
type_->run_ends_type());
+  }
+  current_value_.reset();
+  run_start_ = 0;
+  return Status::OK();
+}
+
+Status RunEndEncodedBuilder::ResizePhyiscal(int64_t capacity) {
+  RETURN_NOT_OK(value_builder().Resize(capacity));
+  return run_end_builder().Resize(capacity);

Review Comment:
   Why resize both builders with the same capacity? This doesn't seem to make 
sense.



##########
cpp/src/parquet/arrow/path_internal.cc:
##########
@@ -801,6 +801,10 @@ class PathBuilder {
     return Status::OK();
   }
 
+  Status Visit(const ::arrow::RunEndEncodedArray& array) {
+    return Status::NotImplemented("Arrow REE array in Parquet");

Review Comment:
   How about `NOT_IMPLEMENTED_VISIT` below?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -436,6 +438,23 @@ class ConcatenateImpl {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& type) {
+    int64_t physical_length = 0;
+    for (auto input : in_) {
+      physical_length =
+          SafeSignedAdd(physical_length, 
ree_util::FindPhysicalOffset(ArraySpan(*input)));
+    }
+    ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(in_[0]->type, pool_));
+    
RETURN_NOT_OK(internal::checked_cast<RunEndEncodedBuilder&>(*builder).ResizePhyiscal(
+        physical_length));
+    for (auto input : in_) {

Review Comment:
   ```suggestion
       for (const auto& input : in_) {
   ```



##########
cpp/src/arrow/ipc/writer.cc:
##########
@@ -526,6 +526,10 @@ class RecordBatchSerializer {
     return VisitType(*array.indices());
   }
 
+  Status Visit(const RunEndEncodedArray& type) {
+    return Status::NotImplemented("run-end encoded array in IPC");
+  }

Review Comment:
   I hope there will be a followup PR for this?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -30,6 +30,7 @@
 #include "arrow/array/data.h"
 #include "arrow/array/util.h"
 #include "arrow/buffer.h"
+#include "arrow/builder.h"

Review Comment:
   Just include the relevant builder header?



##########
cpp/src/arrow/array/builder_encoded.h:
##########
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+
+namespace arrow {
+
+/// \addtogroup encoded-builders
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded builder
+
+class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
+ public:
+  RunEndEncodedBuilder(MemoryPool* pool,
+                       const std::shared_ptr<ArrayBuilder>& run_end_builder,
+                       const std::shared_ptr<ArrayBuilder>& value_builder,
+                       std::shared_ptr<DataType> type);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) final;
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Resize(int64_t capacity) final;
+  void Reset() final;
+
+  Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return 
FinishTyped(out); }
+  Status AppendNull() final;
+  Status AppendNulls(int64_t length) final;
+  Status AppendEmptyValue() final;
+  Status AppendEmptyValues(int64_t length) final;
+  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) final;
+  Status AppendScalars(const ScalarVector& scalars) final;
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t 
length) final;
+  std::shared_ptr<DataType> type() const final;
+
+  /// \brief Allocate enough memory for a given number of runs. Like Resize on 
non-REE
+  /// builders, it does not account for variable size data.

Review Comment:
   You should perhaps take a look at the Reserve methods exposed by e.g. List 
or Binary builders.



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -436,6 +438,23 @@ class ConcatenateImpl {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& type) {
+    int64_t physical_length = 0;
+    for (auto input : in_) {
+      physical_length =
+          SafeSignedAdd(physical_length, 
ree_util::FindPhysicalOffset(ArraySpan(*input)));

Review Comment:
   Are we sure this cannot overflow? If it can, we should detect overflow 
instead of letting it wrap around.



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -413,6 +414,20 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status Visit(const RunEndEncodedType& type) {
+    switch (type.run_ends_type()->id()) {
+      case Type::INT16:
+        return ValidateRunEndEncoded<int16_t>(type);
+      case Type::INT32:
+        return ValidateRunEndEncoded<int32_t>(type);
+      case Type::INT64:
+        return ValidateRunEndEncoded<int64_t>(type);
+      default:
+        return Status::Invalid("Run ends type must be int16, int32 or int64, 
but got: ",
+                               type.run_ends_type());

Review Comment:
   Hmm, does it print out the `shared_ptr<DataType>`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to