bkietz commented on a change in pull request #8652:
URL: https://github.com/apache/arrow/pull/8652#discussion_r522302828
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -38,195 +39,172 @@ namespace internal {
namespace {
-struct ValidateArrayVisitor {
- Status Visit(const NullArray& array) {
- ARROW_RETURN_IF(array.null_count() != array.length(),
- Status::Invalid("null_count is invalid"));
- return Status::OK();
- }
+struct ArrayDataValidator {
Review comment:
Nit: just to make it clear to which public function this visitor
corresponds (and it doesn't actually touch array data)
```suggestion
struct ValidateArraympl {
```
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -38,195 +39,172 @@ namespace internal {
namespace {
-struct ValidateArrayVisitor {
- Status Visit(const NullArray& array) {
- ARROW_RETURN_IF(array.null_count() != array.length(),
- Status::Invalid("null_count is invalid"));
- return Status::OK();
- }
+struct ArrayDataValidator {
+ const ArrayData& data;
- Status Visit(const PrimitiveArray& array) {
- if (array.length() > 0) {
- if (array.data()->buffers[1] == nullptr) {
- return Status::Invalid("values buffer is null");
- }
- if (array.values() == nullptr) {
- return Status::Invalid("values is null");
- }
- }
- return Status::OK();
- }
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type,
this); }
- Status Visit(const Decimal128Array& array) {
- if (array.length() > 0 && array.values() == nullptr) {
- return Status::Invalid("values is null");
+ Status Visit(const NullType&) {
+ if (data.null_count != data.length) {
+ return Status::Invalid("Null array null_count unequal to its length");
}
return Status::OK();
}
- Status Visit(const Decimal256Array& array) {
- if (array.length() > 0 && array.values() == nullptr) {
- return Status::Invalid("values is null");
+ Status Visit(const FixedWidthType&) {
+ if (data.length > 0) {
+ if (!IsBufferValid(1)) {
+ return Status::Invalid("Missing values buffer in non-empty array");
+ }
}
return Status::OK();
}
- Status Visit(const StringArray& array) { return ValidateBinaryArray(array); }
+ Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
- Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const LargeStringArray& array) { return
ValidateBinaryArray(array); }
+ Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type);
}
- Status Visit(const LargeBinaryArray& array) { return
ValidateBinaryArray(array); }
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type);
}
- Status Visit(const ListArray& array) { return ValidateListArray(array); }
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
- Status Visit(const LargeListArray& array) { return ValidateListArray(array);
}
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
- Status Visit(const MapArray& array) {
- if (!array.keys()) {
- return Status::Invalid("keys is null");
- }
- return ValidateListArray(array);
- }
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
- Status Visit(const FixedSizeListArray& array) {
- const int64_t len = array.length();
- const int64_t value_size = array.value_length();
- if (len > 0 && !array.values()) {
- return Status::Invalid("values is null");
- }
- if (value_size < 0) {
- return Status::Invalid("FixedSizeListArray has negative value size ",
value_size);
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& values = *data.child_data[0];
+ const int64_t list_size = type.list_size();
+ if (list_size < 0) {
+ return Status::Invalid("Fixed size list has negative list size");
}
+
int64_t expected_values_length = -1;
- if (MultiplyWithOverflow(len, value_size, &expected_values_length) ||
- array.values()->length() != expected_values_length) {
- return Status::Invalid("Values Length (", array.values()->length(),
- ") is not equal to the length (", len,
- ") multiplied by the value size (", value_size,
")");
+ if (MultiplyWithOverflow(data.length, list_size, &expected_values_length)
||
+ values.length != expected_values_length) {
+ return Status::Invalid("Values length (", values.length,
+ ") is not equal to the length (", data.length,
+ ") multiplied by the value size (", list_size,
")");
+ }
+
+ const Status child_valid = ValidateArray(values);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
}
return Status::OK();
}
- Status Visit(const StructArray& array) {
- const auto& struct_type = checked_cast<const StructType&>(*array.type());
- // Validate fields
- for (int i = 0; i < array.num_fields(); ++i) {
- // array.field() may crash due to an assertion in ArrayData::Slice(),
- // so check invariants before
- const auto& field_data = *array.data()->child_data[i];
- if (field_data.length < array.offset()) {
- return Status::Invalid("Struct child array #", i,
- " has length smaller than struct array offset
(",
- field_data.length, " < ", array.offset(), ")");
- }
+ Status Visit(const StructType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
- auto it = array.field(i);
- if (it->length() != array.length()) {
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
return Status::Invalid("Struct child array #", i,
- " has length different from struct array (",
it->length(),
- " != ", array.length(), ")");
+ " invalid: ", field_valid.ToString());
}
- auto it_type = struct_type.field(i)->type();
- if (!it->type()->Equals(it_type)) {
+ if (field_data.length < data.length + data.offset) {
return Status::Invalid("Struct child array #", i,
- " does not match type field: ",
it->type()->ToString(),
- " vs ", it_type->ToString());
+ " has length smaller than struct array (",
+ field_data.length, " != ", data.length +
data.offset, ")");
Review comment:
Users might get confused that what we're implying is the struct array's
length includes its offset:
```suggestion
" has length smaller than the slice viewed by
struct array (",
field_data.length, " < ", data.length +
data.offset, ")");
```
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -392,96 +376,159 @@ Status ValidateArray(const Array& array) {
type.ToString());
}
- ValidateArrayVisitor visitor;
- return VisitArrayInline(array, &visitor);
+ ArrayDataValidator validator{data};
+ return validator.Validate();
}
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return
ValidateArray(*array.data()); }
+
///////////////////////////////////////////////////////////////////////////
-// ValidateArrayData: expensive validation checks
+// ValidateArrayFull: expensive validation checks
namespace {
-struct BoundsCheckVisitor {
- int64_t min_value_;
- int64_t max_value_;
+struct UTF8DataValidator {
+ const ArrayData& data;
- Status Visit(const Array& array) {
+ Status Visit(const DataType&) {
// Default, should be unreachable
return Status::NotImplemented("");
}
- template <typename T>
- Status Visit(const NumericArray<T>& array) {
- for (int64_t i = 0; i < array.length(); ++i) {
- if (!array.IsNull(i)) {
- const auto v = static_cast<int64_t>(array.Value(i));
- if (v < min_value_ || v > max_value_) {
- return Status::Invalid("Value at position ", i, " out of bounds: ",
v,
- " (should be in [", min_value_, ", ",
max_value_, "])");
- }
- }
- }
- return Status::OK();
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ",
i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
};
-struct ValidateArrayDataVisitor {
- // Fallback
- Status Visit(const Array& array) { return Status::OK(); }
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
- Status Visit(const StringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
}
- Status Visit(const LargeStringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds:
", v,
+ " (should be in [", min_value, ", ",
max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
+};
+
+struct ArrayDataFullValidator {
Review comment:
As above,
```suggestion
struct ValidateArrayFullImpl {
```
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -392,96 +376,159 @@ Status ValidateArray(const Array& array) {
type.ToString());
}
- ValidateArrayVisitor visitor;
- return VisitArrayInline(array, &visitor);
+ ArrayDataValidator validator{data};
+ return validator.Validate();
}
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return
ValidateArray(*array.data()); }
+
///////////////////////////////////////////////////////////////////////////
-// ValidateArrayData: expensive validation checks
+// ValidateArrayFull: expensive validation checks
namespace {
-struct BoundsCheckVisitor {
- int64_t min_value_;
- int64_t max_value_;
+struct UTF8DataValidator {
+ const ArrayData& data;
- Status Visit(const Array& array) {
+ Status Visit(const DataType&) {
// Default, should be unreachable
return Status::NotImplemented("");
}
- template <typename T>
- Status Visit(const NumericArray<T>& array) {
- for (int64_t i = 0; i < array.length(); ++i) {
- if (!array.IsNull(i)) {
- const auto v = static_cast<int64_t>(array.Value(i));
- if (v < min_value_ || v > max_value_) {
- return Status::Invalid("Value at position ", i, " out of bounds: ",
v,
- " (should be in [", min_value_, ", ",
max_value_, "])");
- }
- }
- }
- return Status::OK();
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ",
i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
};
-struct ValidateArrayDataVisitor {
- // Fallback
- Status Visit(const Array& array) { return Status::OK(); }
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
- Status Visit(const StringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
}
- Status Visit(const LargeStringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds:
", v,
+ " (should be in [", min_value, ", ",
max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
+};
+
+struct ArrayDataFullValidator {
+ const ArrayData& data;
- Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+ Status Validate() { return ValidateWithType(*data.type); }
- Status Visit(const LargeBinaryArray& array) { return
ValidateBinaryArray(array); }
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type,
this); }
- Status Visit(const ListArray& array) { return ValidateListArray(array); }
+ Status Visit(const NullType& type) { return Status::OK(); }
- Status Visit(const LargeListArray& array) { return ValidateListArray(array);
}
+ Status Visit(const FixedWidthType& type) { return Status::OK(); }
- Status Visit(const MapArray& array) {
- // TODO check keys and items individually?
- return ValidateListArray(array);
+ Status Visit(const StringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
}
- Status Visit(const UnionArray& array) {
- const auto& child_ids = array.union_type()->child_ids();
+ Status Visit(const LargeStringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
- const int8_t* type_codes = array.raw_type_codes();
- for (int64_t i = 0; i < array.length(); ++i) {
- if (array.IsNull(i)) {
- continue;
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type);
}
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
}
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const auto& type_codes_map = type.type_codes();
+
+ const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ // Note that union arrays never have top-level nulls
const int32_t code = type_codes[i];
if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
return Status::Invalid("Union value at position ", i, " has invalid
type id ",
code);
}
}
- if (array.mode() == UnionMode::DENSE) {
+ if (type.mode() == UnionMode::DENSE) {
// Map logical type id to child length
std::vector<int64_t> child_lengths(256);
- const auto& type_codes_map = array.union_type()->type_codes();
- for (int child_id = 0; child_id < array.type()->num_fields();
++child_id) {
- child_lengths[type_codes_map[child_id]] =
array.field(child_id)->length();
+ for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
+ child_lengths[type_codes_map[child_id]] =
data.child_data[child_id]->length;
}
- // Check offsets
- const int32_t* offsets =
- checked_cast<const DenseUnionArray&>(array).raw_value_offsets();
- for (int64_t i = 0; i < array.length(); ++i) {
- if (array.IsNull(i)) {
- continue;
- }
+ // Check offsets are in bounds
+ const int32_t* offsets = data.GetValues<int32_t>(2);
+ for (int64_t i = 0; i < data.length; ++i) {
const int32_t code = type_codes[i];
const int32_t offset = offsets[i];
if (offset < 0) {
Review comment:
We should probably also validate that the offsets are strictly increasing
##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -392,96 +376,159 @@ Status ValidateArray(const Array& array) {
type.ToString());
}
- ValidateArrayVisitor visitor;
- return VisitArrayInline(array, &visitor);
+ ArrayDataValidator validator{data};
+ return validator.Validate();
}
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return
ValidateArray(*array.data()); }
+
///////////////////////////////////////////////////////////////////////////
-// ValidateArrayData: expensive validation checks
+// ValidateArrayFull: expensive validation checks
namespace {
-struct BoundsCheckVisitor {
- int64_t min_value_;
- int64_t max_value_;
+struct UTF8DataValidator {
+ const ArrayData& data;
- Status Visit(const Array& array) {
+ Status Visit(const DataType&) {
// Default, should be unreachable
return Status::NotImplemented("");
}
- template <typename T>
- Status Visit(const NumericArray<T>& array) {
- for (int64_t i = 0; i < array.length(); ++i) {
- if (!array.IsNull(i)) {
- const auto v = static_cast<int64_t>(array.Value(i));
- if (v < min_value_ || v > max_value_) {
- return Status::Invalid("Value at position ", i, " out of bounds: ",
v,
- " (should be in [", min_value_, ", ",
max_value_, "])");
- }
- }
- }
- return Status::OK();
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ",
i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
};
-struct ValidateArrayDataVisitor {
- // Fallback
- Status Visit(const Array& array) { return Status::OK(); }
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
- Status Visit(const StringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
}
- Status Visit(const LargeStringArray& array) {
- RETURN_NOT_OK(ValidateBinaryArray(array));
- return array.ValidateUTF8();
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds:
", v,
+ " (should be in [", min_value, ", ",
max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
+};
+
+struct ArrayDataFullValidator {
+ const ArrayData& data;
- Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+ Status Validate() { return ValidateWithType(*data.type); }
- Status Visit(const LargeBinaryArray& array) { return
ValidateBinaryArray(array); }
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type,
this); }
- Status Visit(const ListArray& array) { return ValidateListArray(array); }
+ Status Visit(const NullType& type) { return Status::OK(); }
- Status Visit(const LargeListArray& array) { return ValidateListArray(array);
}
+ Status Visit(const FixedWidthType& type) { return Status::OK(); }
- Status Visit(const MapArray& array) {
- // TODO check keys and items individually?
- return ValidateListArray(array);
+ Status Visit(const StringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
}
- Status Visit(const UnionArray& array) {
- const auto& child_ids = array.union_type()->child_ids();
+ Status Visit(const LargeStringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
- const int8_t* type_codes = array.raw_type_codes();
- for (int64_t i = 0; i < array.length(); ++i) {
- if (array.IsNull(i)) {
- continue;
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type);
}
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
}
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const auto& type_codes_map = type.type_codes();
+
+ const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ // Note that union arrays never have top-level nulls
const int32_t code = type_codes[i];
if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
return Status::Invalid("Union value at position ", i, " has invalid
type id ",
code);
}
}
Review comment:
Should we also check that no type code is larger than the largest type
code in type_codes_map?
```suggestion
const auto& type_codes_map = type.type_codes();
if (type_codes_map.empty()) return Status::OK();
int8_t max_type_code = *std::max_element(type_codes_map.begin(),
type_codes_map.end());
const int8_t* type_codes = data.GetValues<int8_t>(1);
for (int64_t i = 0; i < data.length; ++i) {
// Note that union arrays never have top-level nulls
const int32_t code = type_codes[i];
if (code < 0 || code > max_type_code || child_ids[code] ==
UnionType::kInvalidChildId) {
return Status::Invalid("Union value at position ", i, " has invalid
type id ",
code);
}
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]