felipecrv commented on code in PR #33641:
URL: https://github.com/apache/arrow/pull/33641#discussion_r1092500261
##########
cpp/src/arrow/array/validate.cc:
##########
@@ -622,6 +637,106 @@ struct ValidateArrayImpl {
return Status::OK();
}
+ template <typename RunEndsType>
+ Status ValidateRunEndEncoded(const RunEndEncodedType& type) {
+ // overflow was already checked at this point
+ if (data.offset + data.length > std::numeric_limits<RunEndsType>::max()) {
+ return Status::Invalid(
+ "Offset + length of an REE array must fit in a value of the run ends
type ",
+ *type.run_ends_type(), ", but offset + length was ", data.offset +
data.length,
+ " while the allowed maximum is ",
std::numeric_limits<RunEndsType>::max());
+ }
+ if (!data.child_data[0]) {
+ return Status::Invalid("Run ends array is null pointer");
+ }
+ if (!data.child_data[1]) {
+ return Status::Invalid("Values array is null pointer");
+ }
+ const ArrayData& run_ends_data = *data.child_data[0];
+ const ArrayData& values_data = *data.child_data[1];
+ if (*run_ends_data.type != *type.run_ends_type()) {
+ return Status::Invalid("Run ends array of ", type, " must be ",
+ *type.run_ends_type(), ", but is ",
*run_ends_data.type);
+ }
+ if (values_data.type != type.encoded_type()) {
+ return Status::Invalid("Parent type says this array encodes ",
*type.encoded_type(),
+ " values, but values array has type ",
*values_data.type);
+ }
+ const Status run_ends_valid = RecurseInto(run_ends_data);
+ if (!run_ends_valid.ok()) {
+ return Status::Invalid("Run ends array invalid: ",
run_ends_valid.ToString());
+ }
+ const Status values_valid = RecurseInto(values_data);
+ if (!values_valid.ok()) {
+ return Status::Invalid("Values array invalid: ",
values_valid.ToString());
+ }
+ if (data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for REE array, but was ",
+ data.null_count);
+ }
+ if (run_ends_data.null_count != 0) {
+ return Status::Invalid("Null count must be 0 for run ends array, but was
",
+ run_ends_data.null_count);
+ }
+ if (!run_ends_data.buffers[1]->is_cpu()) {
+ return Status::NotImplemented("Validating non-CPU run ends buffers");
+ }
+ ArraySpan span(data);
+ const RunEndsType* run_ends = ree_util::RunEnds<RunEndsType>(span);
+ if (run_ends_data.length == 0) {
+ if (data.length == 0) {
+ return Status::OK();
+ } else {
+ return Status::Invalid("REE array has non-zero length ", data.length,
+ ", but run ends array has zero length");
+ }
+ }
+ if (run_ends[run_ends_data.length - 1] < data.offset + data.length) {
+ return Status::Invalid(
+ "Last run in run ends array ends at ", run_ends[run_ends_data.length
- 1],
+ " but this array requires at least ", data.offset + data.length, "
(offset ",
+ data.offset, ", length ", data.length, ")");
+ }
Review Comment:
`data.offset` and `data.length` are the logical `offset` and `length` of the
REE array, not physical positions in the run ends array. The physical offset
into `run_ends` is `run_ends.offset` plus the index of the first run end that
is `> data.offset` (that index is found by binary search).

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]