This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 35b04ca  Fix undefined behavor in GenericStringArray::from_iter_values 
(#1145)
35b04ca is described below

commit 35b04ca3ee40836ca9666fe0127c7da40f546374
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jan 10 16:47:06 2022 -0500

    Fix undefined behavor in GenericStringArray::from_iter_values (#1145)
    
    * Fix undefined behavor in GenericStringArray::from_iter_values
    
    * Cleanup code and tests
    
    * clippy
    
    * Fix test
---
 arrow/src/array/array_string.rs | 73 ++++++++++++++++++++++++++++++++++++++++-
 arrow/src/util/test_util.rs     | 48 +++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 82c9f16..d1826ef 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -187,8 +187,13 @@ impl<OffsetSize: StringOffsetSizeTrait> 
GenericStringArray<OffsetSize> {
             offsets.push(length_so_far);
             values.extend_from_slice(s.as_bytes());
         }
+
+        // iterator size hint may not be correct so compute the actual number 
of offsets
+        assert!(!offsets.is_empty()); // wrote at least one
+        let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) - 
1;
+
         let array_data = ArrayData::builder(OffsetSize::DATA_TYPE)
-            .len(data_len)
+            .len(actual_len)
             .add_buffer(offsets.into())
             .add_buffer(values.into());
         let array_data = unsafe { array_data.build_unchecked() };
@@ -572,4 +577,70 @@ mod tests {
             .validate_full()
             .expect("All null array has valid array data");
     }
+
+    #[cfg(feature = "test_utils")]
+    #[test]
+    fn bad_size_collect_string() {
+        use crate::util::test_util::BadIterator;
+        let data = vec![Some("foo"), None, Some("bar")];
+        let expected: StringArray = data.clone().into_iter().collect();
+
+        // Iterator reports too many items
+        let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect();
+        assert_eq!(expected, arr);
+
+        // Iterator reports too few items
+        let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect();
+        assert_eq!(expected, arr);
+    }
+
+    #[cfg(feature = "test_utils")]
+    #[test]
+    fn bad_size_collect_large_string() {
+        use crate::util::test_util::BadIterator;
+        let data = vec![Some("foo"), None, Some("bar")];
+        let expected: LargeStringArray = data.clone().into_iter().collect();
+
+        // Iterator reports too many items
+        let arr: LargeStringArray = BadIterator::new(3, 10, 
data.clone()).collect();
+        assert_eq!(expected, arr);
+
+        // Iterator reports too few items
+        let arr: LargeStringArray = BadIterator::new(3, 1, 
data.clone()).collect();
+        assert_eq!(expected, arr);
+    }
+
+    #[cfg(feature = "test_utils")]
+    #[test]
+    fn bad_size_iter_values_string() {
+        use crate::util::test_util::BadIterator;
+        let data = vec!["foo", "bar", "baz"];
+        let expected: StringArray = 
data.clone().into_iter().map(Some).collect();
+
+        // Iterator reports too many items
+        let arr = StringArray::from_iter_values(BadIterator::new(3, 10, 
data.clone()));
+        assert_eq!(expected, arr);
+
+        // Iterator reports too few items
+        let arr = StringArray::from_iter_values(BadIterator::new(3, 1, 
data.clone()));
+        assert_eq!(expected, arr);
+    }
+
+    #[cfg(feature = "test_utils")]
+    #[test]
+    fn bad_size_iter_values_large_string() {
+        use crate::util::test_util::BadIterator;
+        let data = vec!["foo", "bar", "baz"];
+        let expected: LargeStringArray = 
data.clone().into_iter().map(Some).collect();
+
+        // Iterator reports too many items
+        let arr =
+            LargeStringArray::from_iter_values(BadIterator::new(3, 10, 
data.clone()));
+        assert_eq!(expected, arr);
+
+        // Iterator reports too few items
+        let arr =
+            LargeStringArray::from_iter_values(BadIterator::new(3, 1, 
data.clone()));
+        assert_eq!(expected, arr);
+    }
 }
diff --git a/arrow/src/util/test_util.rs b/arrow/src/util/test_util.rs
index f02c8d0..6b1ad4b 100644
--- a/arrow/src/util/test_util.rs
+++ b/arrow/src/util/test_util.rs
@@ -152,6 +152,54 @@ fn get_data_dir(udf_env: &str, submodule_data: &str) -> 
Result<PathBuf, Box<dyn
     }
 }
 
+/// An iterator that is untruthful about its actual length
+#[derive(Debug, Clone)]
+pub struct BadIterator<T> {
+    /// where the iterator currently is
+    cur: usize,
+    /// How many items will this iterator *actually* make
+    limit: usize,
+    /// How many items this iterator claims it will make
+    claimed: usize,
+    /// The items to return. If there are fewer items than `limit`
+    /// they will be repeated
+    pub items: Vec<T>,
+}
+
+impl<T> BadIterator<T> {
+    /// Create a new iterator for <limit> items, but that reports to
+    /// produce <claimed> items. Must provide at least 1 item.
+    pub fn new(limit: usize, claimed: usize, items: Vec<T>) -> Self {
+        assert!(!items.is_empty());
+        Self {
+            cur: 0,
+            limit,
+            claimed,
+            items,
+        }
+    }
+}
+
+impl<T: Clone> Iterator for BadIterator<T> {
+    type Item = T;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.cur < self.limit {
+            let next_item_idx = self.cur % self.items.len();
+            let next_item = self.items[next_item_idx].clone();
+            self.cur += 1;
+            Some(next_item)
+        } else {
+            None
+        }
+    }
+
+    /// report whatever the iterator says to
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.claimed as usize))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

Reply via email to