neilconway commented on code in PR #20754:
URL: https://github.com/apache/datafusion/pull/20754#discussion_r2955908724


##########
datafusion/functions/benches/strpos.rs:
##########
@@ -18,178 +18,199 @@
 use arrow::array::{StringArray, StringViewArray};
 use arrow::datatypes::{DataType, Field};
 use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_common::ScalarValue;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
 use rand::distr::Alphanumeric;
 use rand::prelude::StdRng;
 use rand::{Rng, SeedableRng};
 use std::hint::black_box;
-use std::str::Chars;
 use std::sync::Arc;
 
-/// Returns a `Vec<ColumnarValue>` with two elements: a haystack array and a
-/// needle array. Each haystack is a random string of `str_len_chars`
-/// characters. Each needle is a random contiguous substring of its
-/// corresponding haystack (i.e., the needle is always present in the haystack).
-/// Around `null_density` fraction of rows are null and `utf8_density` fraction
-/// contain non-ASCII characters; the remaining rows are ASCII-only.
-fn gen_string_array(
-    n_rows: usize,
+#[rustfmt::skip]
+const UTF8_CORPUS: &[char] = &[
+    // Cyrillic (2 bytes each)
+    'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С',
+    'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Э', 'Ю', 'Я',
+    // CJK (3 bytes each)
+    '数', '据', '融', '合', '查', '询', '引', '擎', '优', '化', '执', '行', '计', '划',
+    '表', '达',
+    // Emoji (4 bytes each)
+    '📊', '🔥', '🚀', '⚡', '🎯', '💡', '🔧', '📈',
+];
+const N_ROWS: usize = 8192;
+
+/// Returns a random string of `len` characters. If `ascii` is true, the string
+/// is ASCII-only; otherwise it is drawn from `UTF8_CORPUS`.
+fn random_string(rng: &mut StdRng, len: usize, ascii: bool) -> String {
+    if ascii {
+        let value: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
+        String::from_utf8(value).unwrap()
+    } else {
+        (0..len)
+            .map(|_| UTF8_CORPUS[rng.random_range(0..UTF8_CORPUS.len())])
+            .collect()
+    }
+}
+
+/// Wraps `strings` into either a `StringArray` or `StringViewArray`.
+fn to_columnar_value(
+    strings: Vec<Option<String>>,
+    is_string_view: bool,
+) -> ColumnarValue {
+    if is_string_view {
+        let arr: StringViewArray = strings.into_iter().collect();
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        let arr: StringArray = strings.into_iter().collect();
+        ColumnarValue::Array(Arc::new(arr))
+    }
+}
+
+/// Returns haystack and needle, where both are arrays. Each needle is a
+/// contiguous substring of its corresponding haystack. Around `null_density`
+/// fraction of rows are null and `utf8_density` fraction contain non-ASCII
+/// characters.
+fn make_array_needle_args(
+    rng: &mut StdRng,
     str_len_chars: usize,
     null_density: f32,
     utf8_density: f32,
-    is_string_view: bool, // false -> StringArray, true -> StringViewArray
+    is_string_view: bool,
 ) -> Vec<ColumnarValue> {
-    let mut rng = StdRng::seed_from_u64(42);
-    let rng_ref = &mut rng;
-
-    let utf8 = "DatafusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
-    let corpus_char_count = utf8.chars().count();
-
-    let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
-    let mut output_sub_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
-    for _ in 0..n_rows {
-        let rand_num = rng_ref.random::<f32>(); // [0.0, 1.0)
-        if rand_num < null_density {
-            output_sub_string_vec.push(None);
-            output_string_vec.push(None);
-        } else if rand_num < null_density + utf8_density {
-            // Generate random UTF8 string
-            let mut generated_string = String::with_capacity(str_len_chars);
-            for _ in 0..str_len_chars {
-                let idx = rng_ref.random_range(0..corpus_char_count);
-                let char = utf8.chars().nth(idx).unwrap();
-                generated_string.push(char);
-            }
-            output_sub_string_vec.push(Some(random_substring(generated_string.chars())));
-            output_string_vec.push(Some(generated_string));
+    let mut haystacks: Vec<Option<String>> = Vec::with_capacity(N_ROWS);
+    let mut needles: Vec<Option<String>> = Vec::with_capacity(N_ROWS);
+    for _ in 0..N_ROWS {
+        let r = rng.random::<f32>();
+        if r < null_density {
+            haystacks.push(None);
+            needles.push(None);
         } else {
-            // Generate random ASCII-only string
-            let value = rng_ref
+            let ascii = r >= null_density + utf8_density;
+            let s = random_string(rng, str_len_chars, ascii);
+            needles.push(Some(random_substring(rng, &s)));
+            haystacks.push(Some(s));
+        }
+    }
+
+    vec![
+        to_columnar_value(haystacks, is_string_view),
+        to_columnar_value(needles, is_string_view),
+    ]
+}
+
+/// Returns haystack array with a fixed scalar needle inserted into each row.
+/// `utf8_density` fraction of rows contain non-ASCII characters.
+/// The needle must be ASCII.
+fn make_scalar_needle_args(
+    rng: &mut StdRng,
+    str_len_chars: usize,
+    needle: &str,
+    utf8_density: f32,
+    is_string_view: bool,
+) -> Vec<ColumnarValue> {
+    let needle_len = needle.len();
+
+    let mut haystacks: Vec<Option<String>> = Vec::with_capacity(N_ROWS);
+    for _ in 0..N_ROWS {
+        let ascii = rng.random::<f32>() >= utf8_density;
+        if ascii {
+            let mut value: Vec<u8> = (&mut *rng)
                 .sample_iter(&Alphanumeric)
                 .take(str_len_chars)
                 .collect();
-            let value = String::from_utf8(value).unwrap();
-            output_sub_string_vec.push(Some(random_substring(value.chars())));
-            output_string_vec.push(Some(value));
+            if str_len_chars >= needle_len {
+                let pos = rng.random_range(0..=str_len_chars - needle_len);
+                value[pos..pos + needle_len].copy_from_slice(needle.as_bytes());
+            }
+            haystacks.push(Some(String::from_utf8(value).unwrap()));

Review Comment:
   Fixed by adding an assert and simplifying the function.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to