yordan-pavlov commented on a change in pull request #8630: URL: https://github.com/apache/arrow/pull/8630#discussion_r521574396
########## File path: rust/arrow/benches/filter_kernels.rs ########## @@ -14,137 +14,136 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +extern crate arrow; + +use rand::{ + distributions::{Alphanumeric, Standard}, + prelude::Distribution, + Rng, +}; use arrow::array::*; -use arrow::compute::{filter, FilterContext}; +use arrow::compute::{build_filter, filter}; use arrow::datatypes::ArrowNumericType; +use arrow::datatypes::{Float32Type, UInt8Type}; + use criterion::{criterion_group, criterion_main, Criterion}; -fn create_primitive_array<T, F>(size: usize, value_fn: F) -> PrimitiveArray<T> +fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T> where T: ArrowNumericType, - F: Fn(usize) -> T::Native, + Standard: Distribution<T::Native>, { + // use random numbers to avoid spurious compiler optimizations wrt to branching + let mut rng = rand::thread_rng(); let mut builder = PrimitiveArray::<T>::builder(size); - for i in 0..size { - builder.append_value(value_fn(i)).unwrap(); + + for _ in 0..size { + if rng.gen::<f32>() < null_density { + builder.append_null().unwrap(); + } else { + builder.append_value(rng.gen()).unwrap(); + } } builder.finish() } -fn create_u8_array_with_nulls(size: usize) -> UInt8Array { - let mut builder = UInt8Builder::new(size); - for i in 0..size { - if i % 2 == 0 { - builder.append_value(1).unwrap(); - } else { +fn create_string_array(size: usize, null_density: f32) -> StringArray { + // use random numbers to avoid spurious compiler optimizations wrt to branching + let mut rng = rand::thread_rng(); + let mut builder = StringBuilder::new(size); + + for _ in 0..size { + if rng.gen::<f32>() < null_density { builder.append_null().unwrap(); + } else { + let value = rng.sample_iter(&Alphanumeric).take(10).collect::<String>(); + builder.append_value(&value).unwrap(); } } builder.finish() } -fn create_bool_array<F>(size: usize, 
value_fn: F) -> BooleanArray -where - F: Fn(usize) -> bool, -{ +fn create_bool_array(size: usize, trues_density: f32) -> BooleanArray { + let mut rng = rand::thread_rng(); let mut builder = BooleanBuilder::new(size); - for i in 0..size { - builder.append_value(value_fn(i)).unwrap(); + for _ in 0..size { + let value = rng.gen::<f32>() < trues_density; Review comment: The filter benchmarks may not simulate real-world use cases, but they are designed to test the code under specific conditions, such as the worst-case scenario with alternating 1s and 0s, where no batch can be skipped and all selected values have to be copied individually. How can this scenario be achieved with a randomly generated filter array? The other scenarios test mostly 0s (the best performance, because most filter batches can be skipped and only a small number of selected values have to be copied) and mostly 1s (not as fast, but still faster than the worst case, because filter batches can be checked quickly and most values are copied in slices). These should be easier to achieve with random filter arrays, but are they going to be repeatable? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org