This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 35b04ca Fix undefined behavor in GenericStringArray::from_iter_values
(#1145)
35b04ca is described below
commit 35b04ca3ee40836ca9666fe0127c7da40f546374
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jan 10 16:47:06 2022 -0500
Fix undefined behavor in GenericStringArray::from_iter_values (#1145)
* Fix undefined behavor in GenericStringArray::from_iter_values
* Cleanup code and tests
* clippy
* Fix test
---
arrow/src/array/array_string.rs | 73 ++++++++++++++++++++++++++++++++++++++++-
arrow/src/util/test_util.rs | 48 +++++++++++++++++++++++++++
2 files changed, 120 insertions(+), 1 deletion(-)
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 82c9f16..d1826ef 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -187,8 +187,13 @@ impl<OffsetSize: StringOffsetSizeTrait>
GenericStringArray<OffsetSize> {
offsets.push(length_so_far);
values.extend_from_slice(s.as_bytes());
}
+
+ // iterator size hint may not be correct so compute the actual number
of offsets
+ assert!(!offsets.is_empty()); // wrote at least one
+ let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) -
1;
+
let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
- .len(data_len)
+ .len(actual_len)
.add_buffer(offsets.into())
.add_buffer(values.into());
let array_data = unsafe { array_data.build_unchecked() };
@@ -572,4 +577,70 @@ mod tests {
.validate_full()
.expect("All null array has valid array data");
}
+
+ #[cfg(feature = "test_utils")]
+ #[test]
+ fn bad_size_collect_string() {
+ use crate::util::test_util::BadIterator;
+ let data = vec![Some("foo"), None, Some("bar")];
+ let expected: StringArray = data.clone().into_iter().collect();
+
+ // Iterator reports too many items
+ let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect();
+ assert_eq!(expected, arr);
+
+ // Iterator reports too few items
+ let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect();
+ assert_eq!(expected, arr);
+ }
+
+ #[cfg(feature = "test_utils")]
+ #[test]
+ fn bad_size_collect_large_string() {
+ use crate::util::test_util::BadIterator;
+ let data = vec![Some("foo"), None, Some("bar")];
+ let expected: LargeStringArray = data.clone().into_iter().collect();
+
+ // Iterator reports too many items
+ let arr: LargeStringArray = BadIterator::new(3, 10,
data.clone()).collect();
+ assert_eq!(expected, arr);
+
+ // Iterator reports too few items
+ let arr: LargeStringArray = BadIterator::new(3, 1,
data.clone()).collect();
+ assert_eq!(expected, arr);
+ }
+
+ #[cfg(feature = "test_utils")]
+ #[test]
+ fn bad_size_iter_values_string() {
+ use crate::util::test_util::BadIterator;
+ let data = vec!["foo", "bar", "baz"];
+ let expected: StringArray =
data.clone().into_iter().map(Some).collect();
+
+ // Iterator reports too many items
+ let arr = StringArray::from_iter_values(BadIterator::new(3, 10,
data.clone()));
+ assert_eq!(expected, arr);
+
+ // Iterator reports too few items
+ let arr = StringArray::from_iter_values(BadIterator::new(3, 1,
data.clone()));
+ assert_eq!(expected, arr);
+ }
+
+ #[cfg(feature = "test_utils")]
+ #[test]
+ fn bad_size_iter_values_large_string() {
+ use crate::util::test_util::BadIterator;
+ let data = vec!["foo", "bar", "baz"];
+ let expected: LargeStringArray =
data.clone().into_iter().map(Some).collect();
+
+ // Iterator reports too many items
+ let arr =
+ LargeStringArray::from_iter_values(BadIterator::new(3, 10,
data.clone()));
+ assert_eq!(expected, arr);
+
+ // Iterator reports too few items
+ let arr =
+ LargeStringArray::from_iter_values(BadIterator::new(3, 1,
data.clone()));
+ assert_eq!(expected, arr);
+ }
}
diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs
index f02c8d0..6b1ad4b 100644
--- a/arrow/src/util/test_util.rs
+++ b/arrow/src/util/test_util.rs
@@ -152,6 +152,54 @@ fn get_data_dir(udf_env: &str, submodule_data: &str) ->
Result<PathBuf, Box<dyn
}
}
+/// An iterator that is untruthful about its actual length
+#[derive(Debug, Clone)]
+pub struct BadIterator<T> {
+ /// where the iterator currently is
+ cur: usize,
+ /// How many items will this iterator *actually* make
+ limit: usize,
+ /// How many items this iterator claims it will make
+ claimed: usize,
+ /// The items to return. If there are fewer items than `limit`
+ /// they will be repeated
+ pub items: Vec<T>,
+}
+
+impl<T> BadIterator<T> {
+ /// Create a new iterator for <limit> items, but that reports to
+ /// produce <claimed> items. Must provide at least 1 item.
+ pub fn new(limit: usize, claimed: usize, items: Vec<T>) -> Self {
+ assert!(!items.is_empty());
+ Self {
+ cur: 0,
+ limit,
+ claimed,
+ items,
+ }
+ }
+}
+
+impl<T: Clone> Iterator for BadIterator<T> {
+ type Item = T;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ if self.cur < self.limit {
+ let next_item_idx = self.cur % self.items.len();
+ let next_item = self.items[next_item_idx].clone();
+ self.cur += 1;
+ Some(next_item)
+ } else {
+ None
+ }
+ }
+
+ /// report whatever the iterator says to
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ (0, Some(self.claimed as usize))
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;