bkietz commented on code in PR #37792:
URL: https://github.com/apache/arrow/pull/37792#discussion_r1352833751


##########
cpp/src/arrow/array/validate.cc:
##########
@@ -595,6 +614,74 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t views_byte_size = data.buffers[1]->size();
+    int64_t required_view_count = data.length + data.offset;
+    if (static_cast<int64_t>(views_byte_size / BinaryViewType::kSize) <
+        required_view_count) {
+      return Status::Invalid("View buffer size (bytes): ", views_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation) return Status::OK();
+
+    auto CheckPrefix = [&](size_t i,
+                           std::array<uint8_t, BinaryViewType::kPrefixSize> 
prefix,
+                           const uint8_t* data) {
+      if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) {
+        return Status::OK();
+      }
+      return Status::Invalid("View at slot ", i, " has inlined prefix 0x",
+                             HexEncode(prefix.data(), 
BinaryViewType::kPrefixSize),
+                             " but the out-of-line data begins with 0x",
+                             HexEncode(data, BinaryViewType::kPrefixSize));
+    };
+
+    util::span views(data.GetValues<BinaryViewType::c_type>(1),
+                     static_cast<size_t>(data.length));
+    util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2);
+
+    for (size_t i = 0; i < static_cast<size_t>(data.length); ++i) {
+      if (data.IsNull(i)) continue;
+
+      if (views[i].size() < 0) {
+        return Status::Invalid("View at slot ", i, " has negative size ",
+                               views[i].size());
+      }
+
+      if (views[i].is_inline()) continue;
+
+      auto [size, prefix, buffer_index, offset] = views[i].ref;
+
+      if (buffer_index < 0) {
+        return Status::Invalid("View at slot ", i, " has negative buffer index 
",
+                               buffer_index);
+      }
+
+      if (offset < 0) {
+        return Status::Invalid("View at slot ", i, " has negative offset ", 
offset);
+      }
+
+      if (static_cast<size_t>(buffer_index) >= data_buffers.size()) {

Review Comment:
   `buffer_index` might be zero; `data_buffers` is the span of buffers after the
   first two, so it is directly indexed by `buffer_index`. Checking that there are
   at least two buffers is handled in the non-full validation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to