yordan-pavlov commented on a change in pull request #8630:
URL: https://github.com/apache/arrow/pull/8630#discussion_r521574396
##########
File path: rust/arrow/benches/filter_kernels.rs
##########
@@ -14,137 +14,136 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+extern crate arrow;
+
+use rand::{
+ distributions::{Alphanumeric, Standard},
+ prelude::Distribution,
+ Rng,
+};
use arrow::array::*;
-use arrow::compute::{filter, FilterContext};
+use arrow::compute::{build_filter, filter};
use arrow::datatypes::ArrowNumericType;
+use arrow::datatypes::{Float32Type, UInt8Type};
+
use criterion::{criterion_group, criterion_main, Criterion};
-fn create_primitive_array<T, F>(size: usize, value_fn: F) -> PrimitiveArray<T>
+fn create_primitive_array<T>(size: usize, null_density: f32) ->
PrimitiveArray<T>
where
T: ArrowNumericType,
- F: Fn(usize) -> T::Native,
+ Standard: Distribution<T::Native>,
{
+ // use random numbers to avoid spurious compiler optimizations wrt to
branching
+ let mut rng = rand::thread_rng();
let mut builder = PrimitiveArray::<T>::builder(size);
- for i in 0..size {
- builder.append_value(value_fn(i)).unwrap();
+
+ for _ in 0..size {
+ if rng.gen::<f32>() < null_density {
+ builder.append_null().unwrap();
+ } else {
+ builder.append_value(rng.gen()).unwrap();
+ }
}
builder.finish()
}
-fn create_u8_array_with_nulls(size: usize) -> UInt8Array {
- let mut builder = UInt8Builder::new(size);
- for i in 0..size {
- if i % 2 == 0 {
- builder.append_value(1).unwrap();
- } else {
+fn create_string_array(size: usize, null_density: f32) -> StringArray {
+ // use random numbers to avoid spurious compiler optimizations wrt to
branching
+ let mut rng = rand::thread_rng();
+ let mut builder = StringBuilder::new(size);
+
+ for _ in 0..size {
+ if rng.gen::<f32>() < null_density {
builder.append_null().unwrap();
+ } else {
+ let value =
rng.sample_iter(&Alphanumeric).take(10).collect::<String>();
+ builder.append_value(&value).unwrap();
}
}
builder.finish()
}
-fn create_bool_array<F>(size: usize, value_fn: F) -> BooleanArray
-where
- F: Fn(usize) -> bool,
-{
+fn create_bool_array(size: usize, trues_density: f32) -> BooleanArray {
+ let mut rng = rand::thread_rng();
let mut builder = BooleanBuilder::new(size);
- for i in 0..size {
- builder.append_value(value_fn(i)).unwrap();
+ for _ in 0..size {
+ let value = rng.gen::<f32>() < trues_density;
Review comment:
The filter benchmarks may not simulate real-world use cases, but they
are designed to test the code under specific conditions, such as the worst-case
scenario of alternating 1s and 0s, where no batch can be skipped and all
selected values have to be copied individually. How can this scenario be
achieved with a randomly generated filter array?
The other scenarios, which test mostly 0s (best performance, because most
filter batches can be skipped and only a small number of selected values have
to be copied) and mostly 1s (not as fast, but still faster than the worst
case, because filter batches can be checked quickly and most values are copied
in slices), should be easier to achieve with random filter arrays, but are they
going to be repeatable?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]