pepijnve commented on code in PR #8753:
URL: https://github.com/apache/arrow-rs/pull/8753#discussion_r2511545389


##########
arrow/benches/merge_kernels.rs:
##########
@@ -0,0 +1,280 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use criterion::measurement::WallTime;
+use criterion::{BenchmarkGroup, BenchmarkId, Criterion, criterion_group, 
criterion_main};
+use rand::distr::{Distribution, StandardUniform};
+use rand::prelude::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint;
+use std::sync::Arc;
+
+use arrow::array::*;
+use arrow::datatypes::*;
+use arrow::util::bench_util::*;
+use arrow_select::merge::merge;
+
+trait InputGenerator {
+    fn name(&self) -> &str;
+
+    /// Return an ArrayRef containing a single null value
+    fn generate_scalar_with_null_value(&self) -> ArrayRef;
+
+    /// Generate a `number_of_scalars` unique scalars
+    fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) 
-> Vec<ArrayRef>;
+
+    /// Generate an array with the specified length and null percentage
+    fn generate_array(&self, seed: u64, array_length: usize, null_percentage: 
f32) -> ArrayRef;
+}
+
+struct GeneratePrimitive<T: ArrowPrimitiveType> {
+    description: String,
+    _marker: std::marker::PhantomData<T>,
+}
+
+impl<T> InputGenerator for GeneratePrimitive<T>
+where
+    T: ArrowPrimitiveType,
+    StandardUniform: Distribution<T::Native>,
+{
+    fn name(&self) -> &str {
+        self.description.as_str()
+    }
+
+    fn generate_scalar_with_null_value(&self) -> ArrayRef {
+        new_null_array(&T::DATA_TYPE, 1)
+    }
+
+    fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) 
-> Vec<ArrayRef> {
+        let rng = StdRng::seed_from_u64(seed);
+
+        rng.sample_iter::<T::Native, _>(StandardUniform)
+            .take(number_of_scalars)
+            .map(|v: T::Native| {
+                Arc::new(PrimitiveArray::<T>::new_scalar(v).into_inner()) as 
ArrayRef
+            })
+            .collect()
+    }
+
+    fn generate_array(&self, seed: u64, array_length: usize, null_percentage: 
f32) -> ArrayRef {
+        Arc::new(create_primitive_array_with_seed::<T>(
+            array_length,
+            null_percentage,
+            seed,
+        ))
+    }
+}
+
+struct GenerateBytes<Byte: ByteArrayType> {
+    range_length: std::ops::Range<usize>,
+    description: String,
+
+    _marker: std::marker::PhantomData<Byte>,
+}
+
+impl<Byte> InputGenerator for GenerateBytes<Byte>
+where
+    Byte: ByteArrayType,
+{
+    fn name(&self) -> &str {
+        self.description.as_str()
+    }
+
+    fn generate_scalar_with_null_value(&self) -> ArrayRef {
+        new_null_array(&Byte::DATA_TYPE, 1)
+    }
+
+    fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) 
-> Vec<ArrayRef> {
+        let array = self.generate_array(seed, number_of_scalars, 0.0);
+
+        (0..number_of_scalars).map(|i| array.slice(i, 1)).collect()
+    }
+
+    fn generate_array(&self, seed: u64, array_length: usize, null_percentage: 
f32) -> ArrayRef {
+        let is_binary =
+            Byte::DATA_TYPE == DataType::Binary || Byte::DATA_TYPE == 
DataType::LargeBinary;
+        if is_binary {
+            Arc::new(create_binary_array_with_len_range_and_prefix_and_seed::<
+                Byte::Offset,
+            >(
+                array_length,
+                null_percentage,
+                self.range_length.start,
+                self.range_length.end - 1,
+                &[],
+                seed,
+            ))
+        } else {
+            Arc::new(create_string_array_with_len_range_and_prefix_and_seed::<
+                Byte::Offset,
+            >(
+                array_length,
+                null_percentage,
+                self.range_length.start,
+                self.range_length.end - 1,
+                "",
+                seed,
+            ))
+        }
+    }
+}
+
+fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> {
+    vec![
+        ("all_true", create_boolean_array(len, 0.0, 1.0)),
+        ("99pct_true", create_boolean_array(len, 0.0, 0.99)),
+        ("90pct_true", create_boolean_array(len, 0.0, 0.9)),
+        ("50pct_true", create_boolean_array(len, 0.0, 0.5)),
+        ("10pct_true", create_boolean_array(len, 0.0, 0.1)),
+        ("1pct_true", create_boolean_array(len, 0.0, 0.01)),
+        ("all_false", create_boolean_array(len, 0.0, 0.0)),
+        ("50pct_nulls", create_boolean_array(len, 0.5, 0.5)),
+    ]
+}
+
+fn bench_merge_on_input_generator(c: &mut Criterion, input_generator: &impl 
InputGenerator) {
+    const ARRAY_LEN: usize = 8192;
+
+    let mut group =
+        c.benchmark_group(format!("merge_{ARRAY_LEN}_from_{}", 
input_generator.name()).as_str());
+
+    let null_scalar = input_generator.generate_scalar_with_null_value();
+    let [non_null_scalar_1, non_null_scalar_2]: [_; 2] = input_generator
+        .generate_non_null_scalars(42, 2)
+        .try_into()
+        .unwrap();
+
+    // For simplicity, we generate arrays with length ARRAY_LEN. Not all input 
values will be used.
+    let array_1_10pct_nulls = input_generator.generate_array(42, ARRAY_LEN, 
0.1);
+    let array_2_10pct_nulls = input_generator.generate_array(18, ARRAY_LEN, 
0.1);
+
+    let masks = mask_cases(ARRAY_LEN);
+
+    // Benchmarks for different scalar combinations
+    for (description, truthy, falsy) in &[
+        ("null_vs_non_null_scalar", &null_scalar, &non_null_scalar_1),
+        (
+            "non_null_scalar_vs_null_scalar",
+            &non_null_scalar_1,
+            &null_scalar,
+        ),
+        ("non_nulls_scalars", &non_null_scalar_1, &non_null_scalar_2),
+    ] {
+        bench_merge_input_on_all_masks(
+            description,
+            &mut group,
+            &masks,
+            &Scalar::new(truthy),
+            &Scalar::new(falsy),
+        );
+    }
+
+    bench_merge_input_on_all_masks(
+        "array_vs_non_null_scalar",
+        &mut group,
+        &masks,
+        &array_1_10pct_nulls,
+        &non_null_scalar_1,
+    );
+
+    bench_merge_input_on_all_masks(
+        "non_null_scalar_vs_array",
+        &mut group,
+        &masks,
+        &array_1_10pct_nulls,
+        &non_null_scalar_1,

Review Comment:
   Indeed. I had copied these from `zip_kernel.rs`, which has the same mistake. 
Fixing here and in `zip_kernel.rs`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to