bkietz commented on a change in pull request #8652:
URL: https://github.com/apache/arrow/pull/8652#discussion_r522384657



##########
File path: cpp/src/arrow/array/validate.cc
##########
@@ -392,96 +376,159 @@ Status ValidateArray(const Array& array) {
                            type.ToString());
   }
 
-  ValidateArrayVisitor visitor;
-  return VisitArrayInline(array, &visitor);
+  ArrayDataValidator validator{data};
+  return validator.Validate();
 }
 
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return 
ValidateArray(*array.data()); }
+
 ///////////////////////////////////////////////////////////////////////////
-// ValidateArrayData: expensive validation checks
+// ValidateArrayFull: expensive validation checks
 
 namespace {
 
-struct BoundsCheckVisitor {
-  int64_t min_value_;
-  int64_t max_value_;
+struct UTF8DataValidator {
+  const ArrayData& data;
 
-  Status Visit(const Array& array) {
+  Status Visit(const DataType&) {
     // Default, should be unreachable
     return Status::NotImplemented("");
   }
 
-  template <typename T>
-  Status Visit(const NumericArray<T>& array) {
-    for (int64_t i = 0; i < array.length(); ++i) {
-      if (!array.IsNull(i)) {
-        const auto v = static_cast<int64_t>(array.Value(i));
-        if (v < min_value_ || v > max_value_) {
-          return Status::Invalid("Value at position ", i, " out of bounds: ", 
v,
-                                 " (should be in [", min_value_, ", ", 
max_value_, "])");
-        }
-      }
-    }
-    return Status::OK();
+  template <typename StringType>
+  enable_if_string<StringType, Status> Visit(const StringType&) {
+    util::InitializeUTF8();
+
+    int64_t i = 0;
+    return VisitArrayDataInline<StringType>(
+        data,
+        [&](util::string_view v) {
+          if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+            return Status::Invalid("Invalid UTF8 sequence at string index ", 
i);
+          }
+          ++i;
+          return Status::OK();
+        },
+        [&]() {
+          ++i;
+          return Status::OK();
+        });
   }
 };
 
-struct ValidateArrayDataVisitor {
-  // Fallback
-  Status Visit(const Array& array) { return Status::OK(); }
+struct BoundsChecker {
+  const ArrayData& data;
+  int64_t min_value;
+  int64_t max_value;
 
-  Status Visit(const StringArray& array) {
-    RETURN_NOT_OK(ValidateBinaryArray(array));
-    return array.ValidateUTF8();
+  Status Visit(const DataType&) {
+    // Default, should be unreachable
+    return Status::NotImplemented("");
   }
 
-  Status Visit(const LargeStringArray& array) {
-    RETURN_NOT_OK(ValidateBinaryArray(array));
-    return array.ValidateUTF8();
+  template <typename IntegerType>
+  enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+    using c_type = typename IntegerType::c_type;
+
+    int64_t i = 0;
+    return VisitArrayDataInline<IntegerType>(
+        data,
+        [&](c_type value) {
+          const auto v = static_cast<int64_t>(value);
+          if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+            return Status::Invalid("Value at position ", i, " out of bounds: 
", v,
+                                   " (should be in [", min_value, ", ", 
max_value, "])");
+          }
+          ++i;
+          return Status::OK();
+        },
+        [&]() {
+          ++i;
+          return Status::OK();
+        });
   }
+};
+
+struct ArrayDataFullValidator {
+  const ArrayData& data;
 
-  Status Visit(const BinaryArray& array) { return ValidateBinaryArray(array); }
+  Status Validate() { return ValidateWithType(*data.type); }
 
-  Status Visit(const LargeBinaryArray& array) { return 
ValidateBinaryArray(array); }
+  Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, 
this); }
 
-  Status Visit(const ListArray& array) { return ValidateListArray(array); }
+  Status Visit(const NullType& type) { return Status::OK(); }
 
-  Status Visit(const LargeListArray& array) { return ValidateListArray(array); 
}
+  Status Visit(const FixedWidthType& type) { return Status::OK(); }
 
-  Status Visit(const MapArray& array) {
-    // TODO check keys and items individually?
-    return ValidateListArray(array);
+  Status Visit(const StringType& type) {
+    RETURN_NOT_OK(ValidateBinaryLike(type));
+    return ValidateUTF8(data);
   }
 
-  Status Visit(const UnionArray& array) {
-    const auto& child_ids = array.union_type()->child_ids();
+  Status Visit(const LargeStringType& type) {
+    RETURN_NOT_OK(ValidateBinaryLike(type));
+    return ValidateUTF8(data);
+  }
 
-    const int8_t* type_codes = array.raw_type_codes();
-    for (int64_t i = 0; i < array.length(); ++i) {
-      if (array.IsNull(i)) {
-        continue;
+  Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+  Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); 
}
+
+  Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+  Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+  Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+  Status Visit(const FixedSizeListType& type) {
+    const ArrayData& child = *data.child_data[0];
+    const Status child_valid = ValidateArrayFull(child);
+    if (!child_valid.ok()) {
+      return Status::Invalid("Fixed size list child array invalid: ",
+                             child_valid.ToString());
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const StructType& type) {
+    // Validate children
+    for (int64_t i = 0; i < type.num_fields(); ++i) {
+      const ArrayData& field = *data.child_data[i];
+      const Status field_valid = ValidateArrayFull(field);
+      if (!field_valid.ok()) {
+        return Status::Invalid("Struct child array #", i,
+                               " invalid: ", field_valid.ToString());
       }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const UnionType& type) {
+    const auto& child_ids = type.child_ids();
+    const auto& type_codes_map = type.type_codes();
+
+    const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+    for (int64_t i = 0; i < data.length; ++i) {
+      // Note that union arrays never have top-level nulls
       const int32_t code = type_codes[i];
       if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
         return Status::Invalid("Union value at position ", i, " has invalid 
type id ",
                                code);
       }
     }
 
-    if (array.mode() == UnionMode::DENSE) {
+    if (type.mode() == UnionMode::DENSE) {
       // Map logical type id to child length
       std::vector<int64_t> child_lengths(256);
-      const auto& type_codes_map = array.union_type()->type_codes();
-      for (int child_id = 0; child_id < array.type()->num_fields(); 
++child_id) {
-        child_lengths[type_codes_map[child_id]] = 
array.field(child_id)->length();
+      for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
+        child_lengths[type_codes_map[child_id]] = 
data.child_data[child_id]->length;
       }
 
-      // Check offsets
-      const int32_t* offsets =
-          checked_cast<const DenseUnionArray&>(array).raw_value_offsets();
-      for (int64_t i = 0; i < array.length(); ++i) {
-        if (array.IsNull(i)) {
-          continue;
-        }
+      // Check offsets are in bounds
+      const int32_t* offsets = data.GetValues<int32_t>(2);
+      for (int64_t i = 0; i < data.length; ++i) {
         const int32_t code = type_codes[i];
         const int32_t offset = offsets[i];
         if (offset < 0) {

Review comment:
       https://arrow.apache.org/docs/format/Columnar.html#dense-union
   
   "The respective offsets for each child value array must be in order / 
increasing."




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to