pitrou commented on code in PR #33641:
URL: https://github.com/apache/arrow/pull/33641#discussion_r1073512429
##########
cpp/src/arrow/compute/kernels/vector_run_end_encode_test.cc:
##########
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
Review Comment:
Are the additional kernels (encode/decode, filter) required for this PR?
Otherwise, it would be better to file them as separate PRs.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
Review Comment:
We certainly shouldn't fail here (see similar code paths).
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
Review Comment:
I don't understand why this is needed, can you explain?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
Review Comment:
```suggestion
return Status::Invalid("Run-end-encoded array has non-zero length ",
data.length,
```
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
+ if (full_validation && run_ends_data.length != 0) {
Review Comment:
`run_ends_data.length == 0` is already excluded above
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
+ if (full_validation && run_ends_data.length != 0) {
+ const int64_t run_ends_length = ree_util::RunEndsArray(span).length;
+ int64_t last_run_end = 0;
+ int64_t physical_offset = 0;
+ int64_t physical_end = 0;
+ for (int64_t index = 0; index < run_ends_length; index++) {
+ int64_t run_end = run_ends[index];
+ if (run_end < 1) {
+ return Status::Invalid(
+ "Run ends array invalid: All run ends must be a positive integer
but run "
+ "end ",
+ index, " is ", run_end);
+ }
+ if (run_end <= last_run_end) {
+ return Status::Invalid(
+ "Run ends array invalid: Each run end must be greater than the
prevous "
+ "one, but run end ",
+ index, " is ", run_end, " and run end ", index - 1, " is ",
last_run_end);
+ }
+ if (run_end > data.offset && last_run_end <= data.offset) {
+ physical_offset = index;
+ }
+ if (run_end >= data.offset + data.length &&
+ last_run_end < data.offset + data.length) {
+ physical_end = index + 1;
+ }
+ last_run_end = run_end;
+ }
+ ARROW_DCHECK_EQ(physical_offset, ree_util::FindPhysicalOffset(span));
+ ARROW_DCHECK_EQ(physical_end, physical_offset +
ree_util::FindPhysicalLength(span));
+ if (values_data.length < physical_end) {
+ return Status::Invalid(
+ "Values array needs at least ", physical_end,
+ " elements to hold the runs described by the run ends array, but
only has ",
+ values_data.length);
+ }
Review Comment:
This should not be needed in this loop. We're checking here that the run
ends are monotonically increasing. We can check out of the loop that the last
run end doesn't exceed the child array length.
##########
cpp/src/arrow/array/array_encoded.cc:
##########
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array/array_encoded.h"
+#include "arrow/array/util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ree_util.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// RunEndEncodedArray
+
+RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<ArrayData>& data)
{
+ ARROW_CHECK_EQ(data->type->id(), Type::RUN_END_ENCODED);
+ SetData(data);
+}
+
+RunEndEncodedArray::RunEndEncodedArray(const std::shared_ptr<DataType>& type,
+ int64_t logical_length,
+ const std::shared_ptr<Array>& run_ends,
+ const std::shared_ptr<Array>& values,
+ int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::RUN_END_ENCODED);
+ SetData(ArrayData::Make(type, logical_length,
+ /*buffers=*/{NULLPTR},
+ /*child_data=*/{run_ends->data(), values->data()},
0, offset));
+}
+
+Result<std::shared_ptr<RunEndEncodedArray>> RunEndEncodedArray::Make(
+ const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>&
values,
+ int64_t logical_length, int64_t offset) {
+ auto run_ends_type = run_ends->type();
+ auto values_type = values->type();
+ if (!RunEndEncodedType::RunEndsTypeValid(*run_ends_type)) {
+ return Status::Invalid("Run ends array must be int16, int32 or int64
type");
+ }
+ if (run_ends->null_count() != 0) {
+ return Status::Invalid("Run ends array cannot contain null values");
+ }
+
+ return std::make_shared<RunEndEncodedArray>(
+ run_end_encoded(std::move(run_ends_type), std::move(values_type)),
logical_length,
+ run_ends, values, offset);
+}
+
+std::shared_ptr<Array> RunEndEncodedArray::run_ends_array() const {
+ return MakeArray(data()->child_data[0]);
+}
+
+std::shared_ptr<Array> RunEndEncodedArray::values_array() const {
+ return MakeArray(data()->child_data[1]);
+}
Review Comment:
These could be cached on the instance instead of calling `MakeArray` every
time.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
Review Comment:
```suggestion
"Offset + length of a run-end-encoded array must fit in a value of
the run ends type ",
```
##########
cpp/src/arrow/array/array_encoded.h:
##########
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes run-end encoded arrays
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup encoded-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded
+
+/// \brief Array type for run-end encoded data
+class ARROW_EXPORT RunEndEncodedArray : public Array {
+ public:
+ using TypeClass = RunEndEncodedType;
+
+ explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
+
+ RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t
logical_length,
+ const std::shared_ptr<Array>& run_ends,
+ const std::shared_ptr<Array>& values, int64_t offset = 0);
+
+ /// \brief Construct a RunEndEncodedArray from values and run ends arrays
+ ///
+ /// The data type is automatically inferred from the arguments.
+ /// The run_ends and values arrays must have the same length.
+ static Result<std::shared_ptr<RunEndEncodedArray>> Make(
+ const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>&
values,
+ int64_t logical_length, int64_t offset = 0);
+
+ /// \brief Returns an array holding the logical indexes of each run-end
+ ///
+ /// The physical offset to the array is applied.
+ std::shared_ptr<Array> run_ends_array() const;
+
+ /// \brief Returns an array holding the values of each run
+ ///
+ /// The physical offset to the array is applied.
+ std::shared_ptr<Array> values_array() const;
+
+ /// \brief Find the physical offset of the REE array
Review Comment:
I don't understand what this means. What is "the physical offset"? Is it
different from the array `offset`?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
Review Comment:
I would call `run_ends_data.GetNullCount` here, because the null count might
be set to -1 (meaning not yet computed).
##########
cpp/src/arrow/array/array_encoded.h:
##########
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes run-end encoded arrays
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup encoded-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded
+
+/// \brief Array type for run-end encoded data
+class ARROW_EXPORT RunEndEncodedArray : public Array {
+ public:
+ using TypeClass = RunEndEncodedType;
+
+ explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
+
+ RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t
logical_length,
Review Comment:
What is the "logical length"? The number of runs, or the accumulated run
length? Please describe parameters in docstrings, when they are non-obvious.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
Review Comment:
What should be clarified is how the `offset` of a run-end-encoded array
works: is it a logical offset or an offset into the run ends?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
Review Comment:
Are you comparing pointer values here?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
Review Comment:
```suggestion
return Status::Invalid("Null count must be 0 for run-end-encoded
array, but was ",
```
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
+ if (full_validation && run_ends_data.length != 0) {
+ const int64_t run_ends_length = ree_util::RunEndsArray(span).length;
+ int64_t last_run_end = 0;
+ int64_t physical_offset = 0;
+ int64_t physical_end = 0;
+ for (int64_t index = 0; index < run_ends_length; index++) {
+ int64_t run_end = run_ends[index];
+ if (run_end < 1) {
+ return Status::Invalid(
+ "Run ends array invalid: All run ends must be a positive integer
but run "
+ "end ",
+ index, " is ", run_end);
+ }
+ if (run_end <= last_run_end) {
+ return Status::Invalid(
+ "Run ends array invalid: Each run end must be greater than the
prevous "
+ "one, but run end ",
+ index, " is ", run_end, " and run end ", index - 1, " is ",
last_run_end);
+ }
+ if (run_end > data.offset && last_run_end <= data.offset) {
+ physical_offset = index;
+ }
+ if (run_end >= data.offset + data.length &&
+ last_run_end < data.offset + data.length) {
+ physical_end = index + 1;
+ }
Review Comment:
These conditions seem weird. You should add explanatory comments.
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
+ if (full_validation && run_ends_data.length != 0) {
+ const int64_t run_ends_length = ree_util::RunEndsArray(span).length;
+ int64_t last_run_end = 0;
+ int64_t physical_offset = 0;
+ int64_t physical_end = 0;
+ for (int64_t index = 0; index < run_ends_length; index++) {
+ int64_t run_end = run_ends[index];
+ if (run_end < 1) {
+ return Status::Invalid(
+ "Run ends array invalid: All run ends must be a positive integer
but run "
+ "end ",
+ index, " is ", run_end);
+ }
+ if (run_end <= last_run_end) {
+ return Status::Invalid(
+ "Run ends array invalid: Each run end must be greater than the
prevous "
+ "one, but run end ",
+ index, " is ", run_end, " and run end ", index - 1, " is ",
last_run_end);
+ }
+ if (run_end > data.offset && last_run_end <= data.offset) {
+ physical_offset = index;
+ }
+ if (run_end >= data.offset + data.length &&
+ last_run_end < data.offset + data.length) {
+ physical_end = index + 1;
+ }
+ last_run_end = run_end;
+ }
+ ARROW_DCHECK_EQ(physical_offset, ree_util::FindPhysicalOffset(span));
+ ARROW_DCHECK_EQ(physical_end, physical_offset +
ree_util::FindPhysicalLength(span));
Review Comment:
That's gonna be quite costly at runtime. Are you sure you need this?
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
+ if (full_validation && run_ends_data.length != 0) {
+ const int64_t run_ends_length = ree_util::RunEndsArray(span).length;
Review Comment:
Can we consolidate this with `RunEnds<RunEndsType>` above?
##########
cpp/src/arrow/util/ree_util.h:
##########
@@ -0,0 +1,280 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/array/data.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/logging.h"
Review Comment:
We shouldn't include `logging.h` in public headers.
##########
cpp/src/arrow/array/array_encoded.h:
##########
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes run-end encoded arrays
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup encoded-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded
+
+/// \brief Array type for run-end encoded data
+class ARROW_EXPORT RunEndEncodedArray : public Array {
+ public:
+ using TypeClass = RunEndEncodedType;
+
+ explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
+
+ RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t
logical_length,
+ const std::shared_ptr<Array>& run_ends,
+ const std::shared_ptr<Array>& values, int64_t offset = 0);
+
+ /// \brief Construct a RunEndEncodedArray from values and run ends arrays
+ ///
+ /// The data type is automatically inferred from the arguments.
+ /// The run_ends and values arrays must have the same length.
+ static Result<std::shared_ptr<RunEndEncodedArray>> Make(
+ const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>&
values,
+ int64_t logical_length, int64_t offset = 0);
+
+ /// \brief Returns an array holding the logical indexes of each run-end
+ ///
+ /// The physical offset to the array is applied.
+ std::shared_ptr<Array> run_ends_array() const;
+
+ /// \brief Returns an array holding the values of each run
+ ///
+ /// The physical offset to the array is applied.
+ std::shared_ptr<Array> values_array() const;
+
+ /// \brief Find the physical offset of the REE array
+ ///
+ /// This function uses binary-search, so it has a O(log N) cost.
+ int64_t FindPhysicalOffset() const;
+
+ /// \brief Find the physical length of the REE array
+ ///
+ /// Avoid calling this function if the physical length can be estabilished in
+ /// some other way. This function uses binary-search, so it has a O(log N)
+ /// cost.
Review Comment:
What does it have a O(log N) cost? What do you call "physical length"? The
number of runs? Something else?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]