drin commented on code in PR #13330:
URL: https://github.com/apache/arrow/pull/13330#discussion_r944925919
##########
cpp/src/arrow/compute/api_vector.h:
##########
@@ -580,6 +587,26 @@ Result<Datum> DictionaryEncode(
const DictionaryEncodeOptions& options =
DictionaryEncodeOptions::Defaults(),
ExecContext* ctx = NULLPTR);
+/// \brief Run-Length-encode values in an array-like object
+/// \param[in] value array-like input
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape and type as input
+///
+/// \since 9.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> RunLengthEncode(const Datum& value, ExecContext* ctx = NULLPTR);
Review Comment:
Should there also be a version that takes a `RunLengthEncodeOptions`?
##########
cpp/src/arrow/array/data.cc:
##########
@@ -393,6 +408,16 @@ int64_t ArraySpan::GetNullCount() const {
return precomputed;
}
+bool ArraySpan::MayHaveNulls() const {
+ if (type->id() == Type::RUN_LENGTH_ENCODED) {
+ return child_data[0].MayHaveNulls();
+ } else {
+ // If an ArrayData is slightly malformed it may have kUnknownNullCount set
Review Comment:
```suggestion
// If an ArraySpan is slightly malformed it may have kUnknownNullCount
set
```
##########
cpp/src/arrow/compute/kernel.cc:
##########
@@ -275,6 +275,41 @@ std::shared_ptr<TypeMatcher> FixedSizeBinaryLike() {
return std::make_shared<FixedSizeBinaryLikeMatcher>();
}
+class RunLengthEncodedMatcher : public TypeMatcher {
+ public:
+ RunLengthEncodedMatcher(std::shared_ptr<TypeMatcher> encoded_type_matcher)
+ : encoded_type_matcher{std::move(encoded_type_matcher)} {}
+
+ bool Matches(const DataType& type) const override {
+ if (type.id() == Type::RUN_LENGTH_ENCODED) {
+ auto& encoding_type = dynamic_cast<const EncodingType&>(type);
+ return encoded_type_matcher->Matches(*encoding_type.encoded_type());
+ } else {
+ return false;
+ }
Review Comment:
```suggestion
if (type.id() != Type::RUN_LENGTH_ENCODED) { return false; }
auto& encoding_type = dynamic_cast<const EncodingType&>(type);
return encoded_type_matcher->Matches(*encoding_type.encoded_type());
```
optional suggestion, but I think it speeds up readability
##########
cpp/src/arrow/util/rle_util.h:
##########
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array/data.h"
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace rle_util {
+
+struct Metadata {
+ int64_t physical_offset;
+ int64_t physical_length;
+};
+
+int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t logical_offset);
+
+int64_t FindPhysicalIndex(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t physical_offset,
int64_t logical_offset, int64_t logical_index);
Review Comment:
can `accumulated_run_lengths` be shortened to `run_lengths`? Otherwise, I'm
not sure what the accumulated portion should signify.
It would also be helpful to know what the difference is between offsets and
indices. Based on the implementations in `rle_util.cc`, it seems like
`FindPhysicalOffset` could be implemented as:
```c
FindPhysicalIndex(run_lengths, physical_length, 0, logical_offset, 0);
```
so, it's not clear to me what the different naming signifies (except maybe
that a physical offset and logical index is/isn't provided)
##########
cpp/src/arrow/util/rle_util.cc:
##########
@@ -0,0 +1,32 @@
+#include "arrow/util/rle_util.h"
+#include <algorithm>
+#include "arrow/builder.h"
+
+namespace arrow {
+namespace rle_util {
+
+int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t logical_offset) {
+ auto it = std::upper_bound(accumulated_run_lengths,
+ accumulated_run_lengths + physical_length,
logical_offset);
+ return std::distance(accumulated_run_lengths, it);
+}
+
+int64_t FindPhysicalIndex(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t physical_offset,
int64_t logical_offset, int64_t logical_index) {
+ auto it = std::upper_bound(accumulated_run_lengths + physical_offset,
+ accumulated_run_lengths + physical_offset +
physical_length,
+ logical_index + logical_offset);
Review Comment:
I don't understand the difference between logical index and logical offset
based on the provided tests. Or, more specifically, they seem to share the same
domain (values in the `run_lengths` list).
##########
cpp/src/arrow/compute/kernel.cc:
##########
@@ -275,6 +275,41 @@ std::shared_ptr<TypeMatcher> FixedSizeBinaryLike() {
return std::make_shared<FixedSizeBinaryLikeMatcher>();
}
+class RunLengthEncodedMatcher : public TypeMatcher {
+ public:
+ RunLengthEncodedMatcher(std::shared_ptr<TypeMatcher> encoded_type_matcher)
+ : encoded_type_matcher{std::move(encoded_type_matcher)} {}
+
+ bool Matches(const DataType& type) const override {
+ if (type.id() == Type::RUN_LENGTH_ENCODED) {
+ auto& encoding_type = dynamic_cast<const EncodingType&>(type);
+ return encoded_type_matcher->Matches(*encoding_type.encoded_type());
+ } else {
+ return false;
+ }
+ }
+
+ bool Equals(const TypeMatcher& other) const override {
+ if (this == &other) {
+ return true;
+ }
+ auto casted = dynamic_cast<const RunLengthEncodedMatcher*>(&other);
+ return casted != nullptr;
+ }
+
+ std::string ToString() const override {
+ return "run_length_encoded(" + encoded_type_matcher->ToString() + ")";
+ };
+
+ private:
+ std::shared_ptr<TypeMatcher> encoded_type_matcher;
Review Comment:
I don't know the surrounding context, so I'm just curious if this could be a
unique_ptr instead and if there is benefit if it were?
##########
cpp/src/arrow/util/rle_util.cc:
##########
@@ -0,0 +1,32 @@
+#include "arrow/util/rle_util.h"
+#include <algorithm>
+#include "arrow/builder.h"
+
+namespace arrow {
+namespace rle_util {
+
+int64_t FindPhysicalOffset(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t logical_offset) {
+ auto it = std::upper_bound(accumulated_run_lengths,
+ accumulated_run_lengths + physical_length,
logical_offset);
+ return std::distance(accumulated_run_lengths, it);
+}
+
+int64_t FindPhysicalIndex(const int32_t* accumulated_run_lengths,
+ int64_t physical_length, int64_t physical_offset,
int64_t logical_offset, int64_t logical_index) {
+ auto it = std::upper_bound(accumulated_run_lengths + physical_offset,
+ accumulated_run_lengths + physical_offset +
physical_length,
+ logical_index + logical_offset);
+ return std::distance(accumulated_run_lengths, it);
Review Comment:
should this accommodate `physical_offset`?
##########
cpp/src/arrow/compute/kernels/vector_run_length_encode_test.cc:
##########
@@ -0,0 +1,192 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/rle_util.h"
+
+namespace arrow {
+namespace compute {
+
+struct RLETestData {
+ static RLETestData JSON(std::shared_ptr<DataType> data_type, std::string
input_json,
+ std::string expected_values_json,
+ std::vector<int32_t> expected_run_lengths,
+ int64_t input_offset = 0) {
+ auto input_array = ArrayFromJSON(data_type, input_json);
+ return {.input = input_array->Slice(input_offset),
+ .expected_values = ArrayFromJSON(data_type, expected_values_json),
+ .expected_run_lengths = std::move(expected_run_lengths),
+ .string = input_json};
+ }
+
+ template <typename ArrowType>
+ static RLETestData TypeMinMaxNull() {
+ using CType = typename ArrowType::c_type;
+ RLETestData result;
+ NumericBuilder<ArrowType> builder;
+ ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::min()));
+ ARROW_EXPECT_OK(builder.AppendNull());
+ ARROW_EXPECT_OK(builder.Append(std::numeric_limits<CType>::max()));
+ result.input = builder.Finish().ValueOrDie();
+ result.expected_values = result.input;
+ result.expected_run_lengths = {1, 2, 3};
+ result.string = "Type min, max, & null values";
+ return result;
+ }
+
+ std::shared_ptr<Array> input;
+ std::shared_ptr<Array> expected_values;
+ std::vector<int32_t> expected_run_lengths;
Review Comment:
When I read this variable, I imagine it to contain the length of an encoded
value--it's "run length". But, if I understand the tests correctly, the
elements in this vector are more like offsets or positions of the last position
of an encoded value.
This is kind of a nitpick, but maybe instead of "run_lengths" it can be
changed to "run_length_offsets" or something a bit clearer. A clarifying
comment might also be sufficient.
##########
cpp/src/arrow/util/rle_util_test.cc:
##########
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/rle_util.h"
+
+namespace arrow {
+namespace rle_util {
+
+TEST(TestRleUtil, FindPhysicalOffsetTest) {
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1}, 1, 0), 0);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 0), 0);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 1), 1);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 2), 2);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){1, 2, 3}, 3, 3), 3);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 0), 0);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 1), 0);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 2), 1);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 3, 4}, 3, 3), 2);
+ ASSERT_EQ(FindPhysicalOffset((const int32_t[]){2, 4, 6}, 3, 3), 1);
+}
+
+TEST(TestRleUtil, FindPhysicalIndexTest) {
+ ASSERT_EQ(FindPhysicalIndex((const int32_t[]){1, 10, 20, 30, 40}, 5, 2, 15,
14), 3);
+ ASSERT_EQ(FindPhysicalIndex((const int32_t[]){1, 10, 20, 30, 40}, 5, 2, 15,
15), 4);
Review Comment:
it occurs to me that the expected values here are relative to the whole RLE
list, rather than relative to the physical offset provided. I am not sure if
that is intended or not?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]