This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 991eb2af43 fix missing utf8 check for conversion from BinaryViewArray
to StringViewArray (#9158)
991eb2af43 is described below
commit 991eb2af43a1c70c39c936773783baad75be91c0
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 14 17:44:01 2026 -0500
fix missing utf8 check for conversion from BinaryViewArray to
StringViewArray (#9158)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- closes https://github.com/apache/arrow-rs/issues/9157
# Rationale for this change
@jhorstmann found that it is possible to bypass UTF-8 validation when
converting a `BinaryViewArray` to a `StringViewArray` by going through
the `ArrayData` APIs
# What changes are included in this PR?
1. Add an assert that the `ArrayData` data type matches `T::DATA_TYPE` to prevent the bypass
2. Add tests
# Are these changes tested?
Yes, new unit tests are added
# Are there any user-facing changes?
Yes: converting `ArrayData` into a `GenericByteViewArray<T>` now panics
if the data type does not match `T::DATA_TYPE`
---------
Co-authored-by: Martin Hilton <[email protected]>
---
arrow-array/src/array/byte_view_array.rs | 58 +++++++++++++++++++++++++++++---
1 file changed, 53 insertions(+), 5 deletions(-)
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index ab6f0cd2d6..c517129a17 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -988,14 +988,18 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a
GenericByteViewArray<T>
impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
fn from(data: ArrayData) -> Self {
- let (_data_type, len, nulls, offset, buffers, _child_data) =
data.into_parts();
-
+ let (data_type, len, nulls, offset, buffers, _child_data) =
data.into_parts();
+ assert_eq!(
+ data_type,
+ T::DATA_TYPE,
+ "Mismatched data type, expected {}, got {data_type}",
+ T::DATA_TYPE
+ );
let mut buffers = buffers.into_iter();
// first buffer is views, remaining are data buffers
let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
-
Self {
- data_type: T::DATA_TYPE,
+ data_type,
views,
buffers: Arc::from_iter(buffers),
nulls,
@@ -1207,9 +1211,11 @@ mod tests {
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray,
StringViewArray,
};
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
- use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
+ use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
+ use arrow_schema::DataType;
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
+ use std::str::from_utf8;
const BLOCK_SIZE: u32 = 8;
@@ -1816,4 +1822,46 @@ mod tests {
assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
}
+
+ #[should_panic(expected = "Mismatched data type, expected Utf8View, got
BinaryView")]
+ #[test]
+ fn invalid_casting_from_array_data() {
+ // Should not be able to cast to StringViewArray due to invalid UTF-8
+ let array_data =
binary_view_array_with_invalid_utf8_data().into_data();
+ let _ = StringViewArray::from(array_data);
+ }
+
+ #[should_panic(expected = "invalid utf-8 sequence")]
+ #[test]
+ fn invalid_array_data() {
+ let (views, buffers, nulls) =
binary_view_array_with_invalid_utf8_data().into_parts();
+
+ // manually try and add invalid array data with Utf8View data type
+ let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
+ .add_buffer(views.into_inner())
+ .len(3);
+ for buffer in buffers.iter() {
+ builder = builder.add_buffer(buffer.clone())
+ }
+ builder = builder.nulls(nulls);
+
+ let data = builder.build().unwrap(); // should fail validation
+ let _arr = StringViewArray::from(data);
+ }
+
+ /// Returns a BinaryViewArray with one invalid UTF-8 value
+ fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
+ let array = GenericByteViewArray::<BinaryViewType>::from(vec![
+ b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
+ &[
+ 0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00,
+ ],
+ b"good",
+ ]);
+ assert!(from_utf8(array.value(0)).is_ok());
+ assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
+ assert!(from_utf8(array.value(2)).is_ok());
+ array
+ }
}