lidavidm commented on code in PR #34195:
URL: https://github.com/apache/arrow/pull/34195#discussion_r1120278602


##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -577,6 +586,30 @@ Result<Datum> DictionaryEncode(
     const DictionaryEncodeOptions& options = 
DictionaryEncodeOptions::Defaults(),
     ExecContext* ctx = NULLPTR);
 
+/// \brief Run-end-encode values in an array-like object
+///
+/// \param[in] value array-like input
+/// \param[in] options configures encoding behavior
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape and type as input

Review Comment:
   same shape, different type, right?



##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -577,6 +586,30 @@ Result<Datum> DictionaryEncode(
     const DictionaryEncodeOptions& options = 
DictionaryEncodeOptions::Defaults(),
     ExecContext* ctx = NULLPTR);
 
+/// \brief Run-end-encode values in an array-like object
+///
+/// \param[in] value array-like input
+/// \param[in] options configures encoding behavior
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape and type as input
+///
+/// \since 12.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> RunEndEncode(const Datum& value, const RunEndEncodeOptions& 
options,
+                           ExecContext* ctx = NULLPTR);
+
+/// \brief Decode a Run-End Encoded array to a plain array
+///
+/// \param[in] value run-end-encoded input
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape and type as input

Review Comment:
   Ditto: same shape, different type, right?



##########
cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc:
##########
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+struct REETestData {
+  static REETestData JSON(std::shared_ptr<DataType> data_type, std::string 
input_json,
+                          std::string expected_values_json,
+                          std::string expected_run_ends_json, int64_t 
input_offset = 0) {
+    auto input_array = ArrayFromJSON(data_type, input_json);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    result.expected_values = ArrayFromJSON(data_type, expected_values_json);
+    result.expected_run_ends_json = std::move(expected_run_ends_json);
+    result.string = input_json;
+    return result;
+  }
+
+  static REETestData NullArray(int64_t input_length, int64_t input_offset = 0) 
{
+    auto input_array = std::make_shared<arrow::NullArray>(input_length);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    const int64_t input_slice_length = result.input->length();
+    result.expected_values =
+        std::make_shared<arrow::NullArray>(input_slice_length > 0 ? 1 : 0);
+    result.expected_run_ends_json =
+        input_slice_length > 0 ? "[" + std::to_string(input_slice_length) + 
"]" : "[]";
+    result.string = "[null * " + std::to_string(input_slice_length) + "]";
+    return result;
+  }
+
+  template <typename ArrowType>
+  static REETestData TypeMinMaxNull() {
+    using CType = typename ArrowType::c_type;
+    REETestData result;
+    NumericBuilder<ArrowType> builder;
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::min()));
+    ARROW_EXPECT_OK(builder.AppendNull());
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::max()));
+    result.input = builder.Finish().ValueOrDie();
+    result.expected_values = result.input;
+    result.expected_run_ends_json = "[1, 2, 3]";
+    result.string = "Type min, max, & null values";
+    return result;
+  }
+
+  std::shared_ptr<Array> input;
+  std::shared_ptr<Array> expected_values;
+  std::string expected_run_ends_json;
+  // only used for gtest output
+  std::string string;

Review Comment:
   nit: `string` isn't very descriptive, maybe `description` if an instance of 
this is meant to be a test case?



##########
cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc:
##########
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+struct REETestData {
+  static REETestData JSON(std::shared_ptr<DataType> data_type, std::string 
input_json,
+                          std::string expected_values_json,
+                          std::string expected_run_ends_json, int64_t 
input_offset = 0) {
+    auto input_array = ArrayFromJSON(data_type, input_json);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    result.expected_values = ArrayFromJSON(data_type, expected_values_json);
+    result.expected_run_ends_json = std::move(expected_run_ends_json);
+    result.string = input_json;
+    return result;
+  }
+
+  static REETestData NullArray(int64_t input_length, int64_t input_offset = 0) 
{
+    auto input_array = std::make_shared<arrow::NullArray>(input_length);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    const int64_t input_slice_length = result.input->length();
+    result.expected_values =
+        std::make_shared<arrow::NullArray>(input_slice_length > 0 ? 1 : 0);
+    result.expected_run_ends_json =
+        input_slice_length > 0 ? "[" + std::to_string(input_slice_length) + 
"]" : "[]";
+    result.string = "[null * " + std::to_string(input_slice_length) + "]";
+    return result;
+  }
+
+  template <typename ArrowType>
+  static REETestData TypeMinMaxNull() {
+    using CType = typename ArrowType::c_type;
+    REETestData result;
+    NumericBuilder<ArrowType> builder;
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::min()));
+    ARROW_EXPECT_OK(builder.AppendNull());
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::max()));
+    result.input = builder.Finish().ValueOrDie();
+    result.expected_values = result.input;
+    result.expected_run_ends_json = "[1, 2, 3]";
+    result.string = "Type min, max, & null values";
+    return result;
+  }
+
+  std::shared_ptr<Array> input;
+  std::shared_ptr<Array> expected_values;
+  std::string expected_run_ends_json;
+  // only used for gtest output
+  std::string string;
+};
+
+}  // namespace
+
+class TestRunEndEncodeDecode : public ::testing::TestWithParam<
+                                   std::tuple<REETestData, 
std::shared_ptr<DataType>>> {
+ public:
+  void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) {
+    auto& child = array->child_data[1];
+    auto builder = MakeBuilder(child->type).ValueOrDie();
+    ARROW_CHECK_OK(builder->AppendNulls(offset));
+    ARROW_CHECK_OK(builder->AppendArraySlice(ArraySpan(*child), 0, 
child->length));
+    array->child_data[1] = 
builder->Finish().ValueOrDie()->Slice(offset)->data();
+  }
+};
+
+TEST_P(TestRunEndEncodeDecode, EncodeDecodeArray) {
+  auto [data, run_ends_type] = GetParam();
+
+  ASSERT_OK_AND_ASSIGN(Datum encoded_datum,
+                       RunEndEncode(data.input, 
RunEndEncodeOptions(run_ends_type)));
+
+  auto encoded = encoded_datum.array();
+  auto run_ends_array = MakeArray(encoded->child_data[0]);
+  auto values_array = MakeArray(encoded->child_data[1]);
+  ASSERT_OK(MakeArray(encoded)->ValidateFull());
+  ASSERT_ARRAYS_EQUAL(*ArrayFromJSON(run_ends_type, 
data.expected_run_ends_json),
+                      *run_ends_array);
+  ASSERT_ARRAYS_EQUAL(*values_array, *data.expected_values);
+  ASSERT_EQ(encoded->buffers.size(), 1);
+  ASSERT_EQ(encoded->buffers[0], NULLPTR);
+  ASSERT_EQ(encoded->child_data.size(), 2);
+  ASSERT_EQ(run_ends_array->data()->buffers[0], NULLPTR);
+  ASSERT_EQ(run_ends_array->length(), data.expected_values->length());
+  ASSERT_EQ(run_ends_array->offset(), 0);
+  ASSERT_EQ(encoded->length, data.input->length());
+  ASSERT_EQ(encoded->offset, 0);
+  ASSERT_EQ(*encoded->type, RunEndEncodedType(run_ends_type, 
data.input->type()));
+  ASSERT_EQ(encoded->null_count, 0);
+
+  ASSERT_OK_AND_ASSIGN(Datum decoded_datum, RunEndDecode(encoded));
+  auto decoded = decoded_datum.make_array();
+  ASSERT_OK(decoded->ValidateFull());
+  ASSERT_ARRAYS_EQUAL(*decoded, *data.input);
+}
+
+// Encoding an input with an offset results in a completely new encoded array 
without an
+// offset. This means the EncodeDecodeArray test will never actually decode an 
array
+// with an offset, even though we have inputs with offsets. This test slices 
one element
+// off the encoded array and decodes that.
+TEST_P(TestRunEndEncodeDecode, DecodeWithOffset) {
+  auto [data, run_ends_type] = GetParam();
+  if (data.input->length() == 0) {
+    return;
+  }
+
+  ASSERT_OK_AND_ASSIGN(Datum encoded_datum,
+                       RunEndEncode(data.input, 
RunEndEncodeOptions(run_ends_type)));
+
+  auto encoded = encoded_datum.array();
+  ASSERT_OK_AND_ASSIGN(Datum datum_without_first,
+                       RunEndDecode(encoded->Slice(1, encoded->length - 1)));
+  ASSERT_OK_AND_ASSIGN(Datum datum_without_last,
+                       RunEndDecode(encoded->Slice(0, encoded->length - 1)));
+  auto array_without_first = datum_without_first.make_array();
+  auto array_without_last = datum_without_last.make_array();
+  ASSERT_OK(array_without_first->ValidateFull());
+  ASSERT_OK(array_without_last->ValidateFull());
+  ASSERT_ARRAYS_EQUAL(*array_without_first, *data.input->Slice(1));
+  ASSERT_ARRAYS_EQUAL(*array_without_last,
+                      *data.input->Slice(0, data.input->length() - 1));
+}
+
+// This test creates a run-end encoded array with an offset in the child 
array, which
+// removes the first run in the test data.
+TEST_P(TestRunEndEncodeDecode, DecodeWithOffsetInChildArray) {
+  auto [data, run_ends_type] = GetParam();
+
+  ASSERT_OK_AND_ASSIGN(Datum encoded_datum,
+                       RunEndEncode(data.input, 
RunEndEncodeOptions(run_ends_type)));
+
+  auto encoded = encoded_datum.array();
+  this->AddArtificialOffsetInChildArray(encoded.get(), 100);
+  ASSERT_OK_AND_ASSIGN(Datum datum_without_first, RunEndDecode(encoded));
+  auto array_without_first = datum_without_first.make_array();
+  ASSERT_OK(array_without_first->ValidateFull());
+  ASSERT_ARRAYS_EQUAL(*array_without_first, *data.input);
+}
+
+INSTANTIATE_TEST_SUITE_P(

Review Comment:
   Is it possible to test scalars and chunked arrays here, too?



##########
cpp/src/arrow/compute/kernels/vector_run_end_encode.cc:
##########
@@ -0,0 +1,654 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <utility>
+
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/common_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+template <typename ArrowType, bool has_validity_buffer>
+struct ReadValueImpl {
+  using CType = typename ArrowType::c_type;
+
+  [[nodiscard]] bool ReadValue(const uint8_t* input_validity, const void* 
input_values,
+                               CType* out, int64_t read_offset) const {
+    bool valid = true;
+    if constexpr (has_validity_buffer) {
+      valid = bit_util::GetBit(input_validity, read_offset);
+    }
+    if (valid) {
+      *out = (reinterpret_cast<const CType*>(input_values))[read_offset];
+    }
+    return valid;
+  }
+};
+
+template <>
+bool ReadValueImpl<BooleanType, true>::ReadValue(const uint8_t* input_validity,
+                                                 const void* input_values, 
CType* out,
+                                                 int64_t read_offset) const {
+  const bool valid = bit_util::GetBit(input_validity, read_offset);
+  *out = valid &&
+         bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), 
read_offset);
+  return valid;
+}
+
+template <>
+bool ReadValueImpl<BooleanType, false>::ReadValue(const uint8_t* 
input_validity,
+                                                  const void* input_values, 
CType* out,
+                                                  int64_t read_offset) const {
+  *out = bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), 
read_offset);
+  return true;
+}
+
+template <typename ArrowType, bool has_validity_buffer>
+struct WriteValueImpl {

Review Comment:
   Is there any possibility of consolidating this with the code linked below?
https://github.com/apache/arrow/blob/f74d5285803c23e6f553d655ea5f184d06d26607/cpp/src/arrow/compute/kernels/copy_data_internal.h



##########
cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc:
##########
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+namespace compute {
+
+namespace {
+
+struct REETestData {
+  static REETestData JSON(std::shared_ptr<DataType> data_type, std::string 
input_json,
+                          std::string expected_values_json,
+                          std::string expected_run_ends_json, int64_t 
input_offset = 0) {
+    auto input_array = ArrayFromJSON(data_type, input_json);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    result.expected_values = ArrayFromJSON(data_type, expected_values_json);
+    result.expected_run_ends_json = std::move(expected_run_ends_json);
+    result.string = input_json;
+    return result;
+  }
+
+  static REETestData NullArray(int64_t input_length, int64_t input_offset = 0) 
{
+    auto input_array = std::make_shared<arrow::NullArray>(input_length);
+    REETestData result;
+    result.input = input_array->Slice(input_offset);
+    const int64_t input_slice_length = result.input->length();
+    result.expected_values =
+        std::make_shared<arrow::NullArray>(input_slice_length > 0 ? 1 : 0);
+    result.expected_run_ends_json =
+        input_slice_length > 0 ? "[" + std::to_string(input_slice_length) + 
"]" : "[]";
+    result.string = "[null * " + std::to_string(input_slice_length) + "]";
+    return result;
+  }
+
+  template <typename ArrowType>
+  static REETestData TypeMinMaxNull() {
+    using CType = typename ArrowType::c_type;
+    REETestData result;
+    NumericBuilder<ArrowType> builder;
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::min()));
+    ARROW_EXPECT_OK(builder.AppendNull());
+    ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::max()));
+    result.input = builder.Finish().ValueOrDie();
+    result.expected_values = result.input;
+    result.expected_run_ends_json = "[1, 2, 3]";
+    result.string = "Type min, max, & null values";
+    return result;
+  }
+
+  std::shared_ptr<Array> input;
+  std::shared_ptr<Array> expected_values;
+  std::string expected_run_ends_json;
+  // only used for gtest output
+  std::string string;
+};
+
+}  // namespace
+
+class TestRunEndEncodeDecode : public ::testing::TestWithParam<
+                                   std::tuple<REETestData, 
std::shared_ptr<DataType>>> {
+ public:
+  void AddArtificialOffsetInChildArray(ArrayData* array, int64_t offset) {
+    auto& child = array->child_data[1];
+    auto builder = MakeBuilder(child->type).ValueOrDie();
+    ARROW_CHECK_OK(builder->AppendNulls(offset));

Review Comment:
   You can use ASSERT_OK and ASSERT_OK_AND_ASSIGN here instead of
   ARROW_CHECK_OK and ValueOrDie().
   
   Then you can call this as 
`ASSERT_NO_FATAL_FAILURE(AddArtificialOffsetInChildArray(...))`.



##########
cpp/src/arrow/compute/kernels/vector_run_end_encode.cc:
##########
@@ -0,0 +1,654 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <utility>
+
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/common_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+template <typename ArrowType, bool has_validity_buffer>
+struct ReadValueImpl {

Review Comment:
   Is it possible to use one of the existing typed visitors instead of a
hand-rolled loop?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to