kosiew commented on code in PR #22143:
URL: https://github.com/apache/datafusion/pull/22143#discussion_r3239662396
##########
datafusion/functions-nested/benches/array_expression.rs:
##########
@@ -15,33 +15,143 @@
// specific language governing permissions and limitations
// under the License.
-use criterion::{Criterion, criterion_group, criterion_main};
-use datafusion_expr::lit;
-use datafusion_functions_nested::expr_fn::{array_replace_all, make_array};
+use arrow::array::{ArrayRef, Int64Builder, ListArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+ BenchmarkGroup, Criterion, criterion_group, criterion_main,
measurement::WallTime,
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
+use datafusion_functions_nested::replace::{
+ array_replace_all_udf, array_replace_n_udf, array_replace_udf,
+};
+use rand::seq::IndexedRandom;
+use rand::{Rng, SeedableRng, rngs::StdRng};
use std::hint::black_box;
+use std::sync::Arc;
+
+// (num_rows, list_size) — tuned so benches finish in a few seconds. At
+// NEEDLE_DENSITY = 0.1, list_size = 100/500 reliably exercise the
+// match-and-replace path (and the counter cutoff for `array_replace_n`);
+// list_size = 10 exercises the null-row and no-match early-return paths.
+const SIZES: &[(usize, usize)] = &[(4_000, 10), (10_000, 100), (10_000, 500)];
+const SEED: u64 = 42;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
+const NEEDLE: i64 = 0;
+const REPLACEMENT: i64 = -1;
fn criterion_benchmark(c: &mut Criterion) {
- // Construct large arrays for benchmarking
+ let list_data_type =
+ DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true)));
+ let return_field: Arc<Field> =
+ Field::new("result", list_data_type.clone(), true).into();
+ let config_options = Arc::new(ConfigOptions::default());
- let array_len = 100_000;
+ let array_field: Arc<Field> = Field::new("array", list_data_type,
true).into();
+ let from_field: Arc<Field> = Field::new("from", DataType::Int64,
true).into();
+ let to_field: Arc<Field> = Field::new("to", DataType::Int64, true).into();
+ let max_field: Arc<Field> = Field::new("max", DataType::Int64,
true).into();
- let array = (0..array_len).map(|_| lit(2_i64)).collect::<Vec<_>>();
- let list_array = make_array(vec![make_array(array); 3]);
- let from_array = make_array(vec![lit(2_i64); 3]);
- let to_array = make_array(vec![lit(-2_i64); 3]);
+ let three_arg_fields =
+ vec![array_field.clone(), from_field.clone(), to_field.clone()];
+ let four_arg_fields = vec![array_field, from_field, to_field, max_field];
- // Benchmark array functions
+ let mut group = c.benchmark_group("array_replace_int64");
Review Comment:
Nice addition. One thing I noticed is that
`datafusion/functions-nested/benches/array_replace.rs` already contains a
dedicated `array_replace` benchmark covering `array_replace_udf`,
`array_replace_n_udf`, and `array_replace_all_udf` across the same int64 sizes,
along with nested, string, bool, and fixed-size-binary cases.
It might be worth consolidating this into the dedicated `array_replace.rs`
benchmark, or removing the now somewhat misleading `array_expression` benchmark
target. That would help avoid duplicate benchmark maintenance and keep
ownership clearer.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]