drin commented on code in PR #13330: URL: https://github.com/apache/arrow/pull/13330#discussion_r952039138
########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast<RunLengthEncodedArray>(new_array), NULLPTR); +} + +TEST(RunLengthEncodedArray, FromRunEndsAndValues) { + std::shared_ptr<RunLengthEncodedArray> rle_array; + + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + ASSERT_EQ(rle_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 0); + ASSERT_EQ(rle_array->data()->null_count, 0); + // one dummy buffer, since code may assume there is exactly one buffer + ASSERT_EQ(rle_array->data()->buffers.size(), 1); Review Comment: ```suggestion // validity buffer must exist and be empty because each RLE array entries must be valid ASSERT_EQ(rle_array->data()->buffers.size(), 1); ASSERT_EQ(rle_array->data()->buffers[0], NULLPTR); ``` I think this would be easier for newer developers to understand the semantics of the RLE array. (Also, if this is incorrect then I probably need to re-assess my review so far lol) ########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object Review Comment: Maybe we should also add a mutation to one array and assert that they're not equal? If the intent of this test is just to have coverage of MakeArray, then it's probably fine. If we want to be sure that `MakeArray` does a proper copy (which I assume is the intended semantics of `MakeArray`), then the additional mutation and assertion could be useful. ########## cpp/src/arrow/compute/kernels/vector_run_length_encode.cc: ########## @@ -0,0 +1,393 @@ +#include "arrow/compute/api_vector.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/rle_util.h" + +namespace arrow { +namespace compute { +namespace internal { + +template <typename ArrowType, bool has_validity_buffer> +struct EncodeDecodeCommonExec { + using CType = typename ArrowType::c_type; + + struct Element { + bool valid; + CType value; + + bool operator!=(const Element& other) const { + return valid != other.valid || value != other.value; + } + }; + + EncodeDecodeCommonExec(KernelContext* kernel_context, const ExecSpan& span, + ExecResult* result) + : kernel_context{kernel_context}, + input_array{span.values[0].array}, + exec_result{result} { + ARROW_DCHECK(span.num_values() == 1); + } + + Element ReadValue() { + Element result; + if (has_validity_buffer) { + result.valid = bit_util::GetBit(input_validity, read_offset); + } else { + result.valid = true; + } + result.value = (reinterpret_cast<const CType*>(input_values))[read_offset]; + return result; + } + + void WriteValue(Element element) { + if (has_validity_buffer) { + bit_util::SetBitsTo(output_validity, write_offset, 1, element.valid); + } + (reinterpret_cast<CType*>(output_values))[write_offset] = element.value; + } + + KernelContext* kernel_context; + const ArraySpan input_array; + ExecResult* exec_result; + const uint8_t* input_validity; + const void* input_values; + uint8_t* output_validity; + void* output_values; + // read offset is a physical index into the values buffer, including array offsets + int64_t read_offset; + int64_t write_offset; +}; + +template <> +EncodeDecodeCommonExec<BooleanType, true>::Element +EncodeDecodeCommonExec<BooleanType, true>::ReadValue() { + Element result; + result.valid = bit_util::GetBit(input_validity, read_offset); + if (result.valid) { + result.value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset); + } + return result; +} + +template <> +EncodeDecodeCommonExec<BooleanType, false>::Element +EncodeDecodeCommonExec<BooleanType, false>::ReadValue() { + return { + .valid = true, + .value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset), + }; +} + +template <> +void EncodeDecodeCommonExec<BooleanType, true>::WriteValue(Element element) { + bit_util::SetBitTo(output_validity, write_offset, element.valid); + if (element.valid) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); + } +} + +template <> +void EncodeDecodeCommonExec<BooleanType, false>::WriteValue(Element element) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); +} + +template <typename ArrowType, bool has_validity_buffer> +struct RunLengthEncodeExec + : public EncodeDecodeCommonExec<ArrowType, has_validity_buffer> { + using EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::EncodeDecodeCommonExec; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::CType; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::Element; + + Status Exec() { + ArrayData* output_array_data = this->exec_result->array_data().get(); + if (this->input_array.length == 0) { + output_array_data->length = 0; + output_array_data->offset = 0; + output_array_data->buffers = {NULLPTR}; + output_array_data->child_data[0] = + ArrayData::Make(this->input_array.type->GetSharedPtr(), + /*length =*/0, + /*buffers =*/{NULLPTR, NULLPTR}); + return Status::OK(); + } + if (this->input_array.length > std::numeric_limits<int32_t>::max()) { + return Status::Invalid( + "run-length encoded arrays can only have a number of elements that can be " + "represented as a 32-bit signed integer"); + } + this->input_validity = this->input_array.buffers[0].data; + this->input_values = this->input_array.buffers[1].data; + int64_t input_offset = this->input_array.offset; + + this->read_offset = input_offset; + Element element = this->ReadValue(); + int64_t num_values_output = 1; + + // calculate input null count by ourselves. The input span likely got sliced by an + // ExecSpanIterator using SetOffset right before this code executes. SetOffset leaves + // the null_count value of the ArraySpan as kUnknownNullCount. Review Comment: is this because slice doesn't adjust the null count (to reduce overhead)? ########## cpp/src/arrow/array/array_encoded.cc: ########## @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/array/array_encoded.h" +#include "arrow/array/util.h" +#include "arrow/util/logging.h" +#include "arrow/util/rle_util.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// RunLengthEncodedArray + +RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr<ArrayData>& data) { + ARROW_CHECK_EQ(data->type->id(), Type::RUN_LENGTH_ENCODED); + SetData(data); +} + +RunLengthEncodedArray::RunLengthEncodedArray(const std::shared_ptr<DataType>& type, + int64_t length, + const std::shared_ptr<Array>& run_ends_array, + const std::shared_ptr<Array>& values_array, + int64_t offset) { + ARROW_CHECK_EQ(type->id(), Type::RUN_LENGTH_ENCODED); + SetData(ArrayData::Make(type, length, {NULLPTR}, 0, offset)); + data_->child_data.push_back(std::move(run_ends_array->data())); + data_->child_data.push_back(std::move(values_array->data())); Review Comment: okay, I think I figured it out, but just to confirm: this approach allows you to keep binary data contiguous (which you need offsets for), which prevents you from reasonably holding run ends and run values in a single ArrayData? ########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast<RunLengthEncodedArray>(new_array), NULLPTR); Review Comment: I'm not sure if this tests some detailed semantics, but I can't think of a scenario where this fails but the previous assertion would succeed. If `Make` is `OK` and `MakeArray` returns a pointer to the same data, won't this always succeed? ########## cpp/src/arrow/compute/kernels/vector_run_length_encode.cc: ########## @@ -0,0 +1,393 @@ +#include "arrow/compute/api_vector.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/rle_util.h" + +namespace arrow { +namespace compute { +namespace internal { + +template <typename ArrowType, bool has_validity_buffer> +struct EncodeDecodeCommonExec { + using CType = typename ArrowType::c_type; + + struct Element { + bool valid; + CType value; + + bool operator!=(const Element& other) const { + return valid != other.valid || value != other.value; + } + }; + + EncodeDecodeCommonExec(KernelContext* kernel_context, const ExecSpan& span, + ExecResult* result) + : kernel_context{kernel_context}, + input_array{span.values[0].array}, + exec_result{result} { + ARROW_DCHECK(span.num_values() == 1); + } + + Element ReadValue() { + Element result; + if (has_validity_buffer) { + result.valid = bit_util::GetBit(input_validity, read_offset); + } else { + result.valid = true; + } + result.value = (reinterpret_cast<const CType*>(input_values))[read_offset]; + return result; + } + + void WriteValue(Element element) { + if (has_validity_buffer) { + bit_util::SetBitsTo(output_validity, write_offset, 1, element.valid); + } + (reinterpret_cast<CType*>(output_values))[write_offset] = element.value; + } + + KernelContext* kernel_context; + const ArraySpan input_array; + ExecResult* exec_result; + const uint8_t* input_validity; + const void* input_values; + uint8_t* output_validity; + void* output_values; + // read offset is a physical index into the values buffer, including array offsets + int64_t read_offset; + int64_t write_offset; +}; + +template <> +EncodeDecodeCommonExec<BooleanType, true>::Element +EncodeDecodeCommonExec<BooleanType, true>::ReadValue() { + Element result; + result.valid = bit_util::GetBit(input_validity, read_offset); + if (result.valid) { + result.value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset); + } + return result; +} + +template <> +EncodeDecodeCommonExec<BooleanType, false>::Element +EncodeDecodeCommonExec<BooleanType, false>::ReadValue() { + return { + .valid = true, + .value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset), + }; +} + +template <> +void EncodeDecodeCommonExec<BooleanType, true>::WriteValue(Element element) { + bit_util::SetBitTo(output_validity, write_offset, element.valid); + if (element.valid) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); + } +} + +template <> +void EncodeDecodeCommonExec<BooleanType, false>::WriteValue(Element element) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); +} + +template <typename ArrowType, bool has_validity_buffer> +struct RunLengthEncodeExec + : public EncodeDecodeCommonExec<ArrowType, has_validity_buffer> { + using EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::EncodeDecodeCommonExec; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::CType; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::Element; + + Status Exec() { + ArrayData* output_array_data = this->exec_result->array_data().get(); Review Comment: is `this->exec_result->array_data().get()` guaranteed to be valid/pre-allocated? Looking at `VectorExecutor` and `ScalarExecutor` it does seem so, but I just wanted to confirm. (I actually have code that assumes it isn't, so I probably should revisit that) ########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast<RunLengthEncodedArray>(new_array), NULLPTR); +} + +TEST(RunLengthEncodedArray, FromRunEndsAndValues) { + std::shared_ptr<RunLengthEncodedArray> rle_array; + + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + ASSERT_EQ(rle_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 0); + ASSERT_EQ(rle_array->data()->null_count, 0); + // one dummy buffer, since code may assume there is exactly one buffer + ASSERT_EQ(rle_array->data()->buffers.size(), 1); + + // explicitly passing offset + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 2, 1)); + ASSERT_EQ(rle_array->length(), 2); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 1); + // explicitly access null count variable so it is not calculated automatically + ASSERT_EQ(rle_array->data()->null_count, 0); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array must be int32 type", + RunLengthEncodedArray::Make(string_values, int32_values, 3)); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Invalid: Run ends array cannot contain null values", + RunLengthEncodedArray::Make(int32_only_null, int32_values, 3)); +} + +TEST(RunLengthEncodedArray, OffsetLength) { + auto run_ends = ArrayFromJSON(int32(), "[100, 200, 300, 400, 500]"); + auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "RLE"])"); + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(run_ends, values, 500)); + + ASSERT_EQ(rle_array->GetPhysicalLength(), 5); + ASSERT_EQ(rle_array->GetPhysicalOffset(), 0); + + auto slice = std::dynamic_pointer_cast<RunLengthEncodedArray>(rle_array->Slice(199, 5)); + ASSERT_EQ(slice->GetPhysicalLength(), 2); + ASSERT_EQ(slice->GetPhysicalOffset(), 1); + + auto slice2 = + std::dynamic_pointer_cast<RunLengthEncodedArray>(rle_array->Slice(199, 101)); + ASSERT_EQ(slice2->GetPhysicalLength(), 2); + ASSERT_EQ(slice2->GetPhysicalOffset(), 1); + Review Comment: I think `slice` and `slice2` expectations are self-explanatory. This one I had to think for a few seconds so I think a comment would be helpful ########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast<RunLengthEncodedArray>(new_array), NULLPTR); +} + +TEST(RunLengthEncodedArray, FromRunEndsAndValues) { + std::shared_ptr<RunLengthEncodedArray> rle_array; + + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + ASSERT_EQ(rle_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 0); + ASSERT_EQ(rle_array->data()->null_count, 0); + // one dummy buffer, since code may assume there is exactly one buffer + ASSERT_EQ(rle_array->data()->buffers.size(), 1); + + // explicitly passing offset + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 2, 1)); + ASSERT_EQ(rle_array->length(), 2); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 1); + // explicitly access null count variable so it is not calculated automatically Review Comment: not sure what `so it is not calculated automatically` means here. ########## cpp/src/arrow/pretty_print.cc: ########## @@ -376,6 +376,10 @@ class ArrayPrinter : public PrettyPrinter { return PrettyPrint(*array.indices(), ChildOptions(true), sink_); } + Status Visit(const RunLengthEncodedArray& array) { + return Status::NotImplemented("printing run-length encoded array"); + } Review Comment: I just wanted to check if this is planned for this PR? ########## cpp/src/arrow/array/array_encoded_test.cc: ########## @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <cstdint> +#include <cstring> +#include <memory> +#include <vector> + +#include "arrow/array.h" +#include "arrow/array/builder_nested.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/testing/builder.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// Run-length encoded array tests + +namespace { + +auto string_values = ArrayFromJSON(utf8(), R"(["Hello", "World", null])"); +auto int32_values = ArrayFromJSON(int32(), "[10, 20, 30]"); +auto int32_only_null = ArrayFromJSON(int32(), "[null, null, null]"); + +TEST(RunLengthEncodedArray, MakeArray) { + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 3)); + auto array_data = rle_array->data(); + auto new_array = MakeArray(array_data); + ASSERT_ARRAYS_EQUAL(*new_array, *rle_array); + // should be the exact same ArrayData object + ASSERT_EQ(new_array->data(), array_data); + ASSERT_NE(std::dynamic_pointer_cast<RunLengthEncodedArray>(new_array), NULLPTR); +} + +TEST(RunLengthEncodedArray, FromRunEndsAndValues) { + std::shared_ptr<RunLengthEncodedArray> rle_array; + + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, int32_values, 3)); + ASSERT_EQ(rle_array->length(), 3); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *int32_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 0); + ASSERT_EQ(rle_array->data()->null_count, 0); + // one dummy buffer, since code may assume there is exactly one buffer + ASSERT_EQ(rle_array->data()->buffers.size(), 1); + + // explicitly passing offset + ASSERT_OK_AND_ASSIGN(rle_array, + RunLengthEncodedArray::Make(int32_values, string_values, 2, 1)); + ASSERT_EQ(rle_array->length(), 2); + ASSERT_ARRAYS_EQUAL(*rle_array->values_array(), *string_values); + ASSERT_ARRAYS_EQUAL(*rle_array->run_ends_array(), *int32_values); + ASSERT_EQ(rle_array->offset(), 1); + // explicitly access null count variable so it is not calculated automatically + ASSERT_EQ(rle_array->data()->null_count, 0); + + ASSERT_RAISES_WITH_MESSAGE(Invalid, "Invalid: Run ends array must be int32 type", + RunLengthEncodedArray::Make(string_values, int32_values, 3)); + ASSERT_RAISES_WITH_MESSAGE( + Invalid, "Invalid: Run ends array cannot contain null values", + RunLengthEncodedArray::Make(int32_only_null, int32_values, 3)); +} + +TEST(RunLengthEncodedArray, OffsetLength) { + auto run_ends = ArrayFromJSON(int32(), "[100, 200, 300, 400, 500]"); + auto values = ArrayFromJSON(utf8(), R"(["Hello", "beautiful", "world", "of", "RLE"])"); + ASSERT_OK_AND_ASSIGN(auto rle_array, + RunLengthEncodedArray::Make(run_ends, values, 500)); + + ASSERT_EQ(rle_array->GetPhysicalLength(), 5); + ASSERT_EQ(rle_array->GetPhysicalOffset(), 0); + + auto slice = std::dynamic_pointer_cast<RunLengthEncodedArray>(rle_array->Slice(199, 5)); + ASSERT_EQ(slice->GetPhysicalLength(), 2); + ASSERT_EQ(slice->GetPhysicalOffset(), 1); + + auto slice2 = + std::dynamic_pointer_cast<RunLengthEncodedArray>(rle_array->Slice(199, 101)); + ASSERT_EQ(slice2->GetPhysicalLength(), 2); + ASSERT_EQ(slice2->GetPhysicalOffset(), 1); + Review Comment: ```suggestion // run ends are exclusive; a start offset of 400 only captures the run for "RLE" ``` ########## cpp/src/arrow/compute/kernels/vector_run_length_encode.cc: ########## @@ -0,0 +1,393 @@ +#include "arrow/compute/api_vector.h" +#include "arrow/compute/kernels/common.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/rle_util.h" + +namespace arrow { +namespace compute { +namespace internal { + +template <typename ArrowType, bool has_validity_buffer> +struct EncodeDecodeCommonExec { + using CType = typename ArrowType::c_type; + + struct Element { + bool valid; + CType value; + + bool operator!=(const Element& other) const { + return valid != other.valid || value != other.value; + } + }; + + EncodeDecodeCommonExec(KernelContext* kernel_context, const ExecSpan& span, + ExecResult* result) + : kernel_context{kernel_context}, + input_array{span.values[0].array}, + exec_result{result} { + ARROW_DCHECK(span.num_values() == 1); + } + + Element ReadValue() { + Element result; + if (has_validity_buffer) { + result.valid = bit_util::GetBit(input_validity, read_offset); + } else { + result.valid = true; + } + result.value = (reinterpret_cast<const CType*>(input_values))[read_offset]; + return result; + } + + void WriteValue(Element element) { + if (has_validity_buffer) { + bit_util::SetBitsTo(output_validity, write_offset, 1, element.valid); + } + (reinterpret_cast<CType*>(output_values))[write_offset] = element.value; + } + + KernelContext* kernel_context; + const ArraySpan input_array; + ExecResult* exec_result; + const uint8_t* input_validity; + const void* input_values; + uint8_t* output_validity; + void* output_values; + // read offset is a physical index into the values buffer, including array offsets + int64_t read_offset; + int64_t write_offset; +}; + +template <> +EncodeDecodeCommonExec<BooleanType, true>::Element +EncodeDecodeCommonExec<BooleanType, true>::ReadValue() { + Element result; + result.valid = bit_util::GetBit(input_validity, read_offset); + if (result.valid) { + result.value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset); + } + return result; +} + +template <> +EncodeDecodeCommonExec<BooleanType, false>::Element +EncodeDecodeCommonExec<BooleanType, false>::ReadValue() { + return { + .valid = true, + .value = + bit_util::GetBit(reinterpret_cast<const uint8_t*>(input_values), read_offset), + }; +} + +template <> +void EncodeDecodeCommonExec<BooleanType, true>::WriteValue(Element element) { + bit_util::SetBitTo(output_validity, write_offset, element.valid); + if (element.valid) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); + } +} + +template <> +void EncodeDecodeCommonExec<BooleanType, false>::WriteValue(Element element) { + bit_util::SetBitTo(reinterpret_cast<uint8_t*>(output_values), write_offset, + element.value); +} + +template <typename ArrowType, bool has_validity_buffer> +struct RunLengthEncodeExec + : public EncodeDecodeCommonExec<ArrowType, has_validity_buffer> { + using EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::EncodeDecodeCommonExec; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::CType; + using typename EncodeDecodeCommonExec<ArrowType, has_validity_buffer>::Element; + + Status Exec() { + ArrayData* output_array_data = this->exec_result->array_data().get(); + if (this->input_array.length == 0) { + output_array_data->length = 0; + output_array_data->offset = 0; + output_array_data->buffers = {NULLPTR}; + output_array_data->child_data[0] = + ArrayData::Make(this->input_array.type->GetSharedPtr(), + /*length =*/0, + /*buffers =*/{NULLPTR, NULLPTR}); + return Status::OK(); + } + if (this->input_array.length > std::numeric_limits<int32_t>::max()) { + return Status::Invalid( + "run-length encoded arrays can only have a number of elements that can be " + "represented as a 32-bit signed integer"); Review Comment: ```suggestion return Status::Invalid("Cannot run-length encode Arrays larger than 2^31 elements"); ``` a little bit more concise of an error message -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
