lidavidm commented on code in PR #747:
URL: https://github.com/apache/arrow-go/pull/747#discussion_r3076613632
##########
arrow/array/string.go:
##########
@@ -169,6 +169,55 @@ func (a *String) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *String) Validate() error {
+ if a.data.length == 0 {
+ return nil
+ }
+ if a.data.buffers[1] == nil {
+ return fmt.Errorf("arrow/array: non-empty string array has no offsets buffer")
+ }
+ expNumOffsets := a.data.offset + a.data.length + 1
+ if len(a.offsets) < expNumOffsets {
+ return fmt.Errorf("arrow/array: string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
+ }
+ lastOffset := int(a.offsets[expNumOffsets-1])
+ if lastOffset > len(a.values) {
+ return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
+ }
Review Comment:
If we're validating the last offset here, it may be worth validating the first
offset too?
##########
arrow/array/string.go:
##########
@@ -169,6 +169,55 @@ func (a *String) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *String) Validate() error {
+ if a.data.length == 0 {
+ return nil
+ }
+ if a.data.buffers[1] == nil {
+ return fmt.Errorf("arrow/array: non-empty string array has no offsets buffer")
+ }
+ expNumOffsets := a.data.offset + a.data.length + 1
+ if len(a.offsets) < expNumOffsets {
+ return fmt.Errorf("arrow/array: string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
+ }
+ lastOffset := int(a.offsets[expNumOffsets-1])
+ if lastOffset > len(a.values) {
+ return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
+ }
+ return nil
+}
+
+// ValidateFull performs a full O(n) consistency check on the array data.
+// In addition to the checks performed by Validate, it also verifies that
+// all offsets are non-negative and monotonically non-decreasing.
+func (a *String) ValidateFull() error {
Review Comment:
Shouldn't we also validate that the data is valid UTF-8?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]