zhuqi-lucas commented on code in PR #7470:
URL: https://github.com/apache/arrow-rs/pull/7470#discussion_r2083498475


##########
parquet/benches/arrow_reader_clickbench.rs:
##########
@@ -0,0 +1,880 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmark for evaluating row filters and projections using the [ClickBench] queries and data.
+//!
+//! While the actual ClickBench queries often also include some sort of aggregation
+//! or limit, this benchmark measures the raw speed of applying filters
+//! and projections, and is intended to help optimize the performance of the
+//! `parquet` filter evaluation in real-world scenarios.
+//!
+//! This benchmark uses the `hits_1.parquet` file, which is a 100,000 row sample of
+//! the entire 100M row dataset. It is reasonable in size and speed to run
+//! and seems to be a good representative of the entire dataset.
+//!
+//! See also `arrow_reader_row_filter` for more focused filter evaluation microbenchmarks.
+//!
+//! [ClickBench]: https://benchmark.clickhouse.com/
+
+use arrow::compute::kernels::cmp::{eq, neq};
+use arrow::compute::{like, nlike, or};
+use arrow_array::types::{Int16Type, Int32Type, Int64Type};
+use arrow_array::{ArrayRef, ArrowPrimitiveType, BooleanArray, PrimitiveArray, 
StringViewArray};
+use arrow_schema::{ArrowError, DataType, Schema};
+use criterion::{criterion_group, criterion_main, Criterion};
+use futures::StreamExt;
+use parquet::arrow::arrow_reader::{
+    ArrowPredicate, ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions,
+    ParquetRecordBatchReaderBuilder, RowFilter,
+};
+use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::schema::types::SchemaDescriptor;
+use std::fmt::{Display, Formatter};
+use std::path::{Path, PathBuf};
+use std::sync::{Arc, OnceLock};
+
+fn async_reader(c: &mut Criterion) { // benchmarks `ReadTest::run_async` for every query from `all_queries()`
+    let rt = tokio::runtime::Builder::new_current_thread() // one current-thread Tokio runtime, reused for all queries
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let mut async_group = c.benchmark_group("arrow_reader_clickbench/async");
+    let handle = rt.handle();
+    for query in all_queries() {
+        let query_name = query.to_string(); // taken before `query` is handed to `ReadTest::new`
+        let read_test = ReadTest::new(query); // constructed outside `b.iter`, so not part of the measured closure
+        async_group.bench_function(query_name, |b| {
+            b.iter(|| handle.block_on(async { read_test.run_async().await })) // block until the async read finishes
+        });
+    }
+}
+
+fn sync_reader(c: &mut Criterion) { // benchmarks `ReadTest::run_sync` for every query from `all_queries()`
+    let mut sync_group = c.benchmark_group("arrow_reader_clickbench/sync");
+    for query in all_queries() {
+        let query_name = query.to_string(); // taken before `query` is handed to `ReadTest::new`
+        let read_test = ReadTest::new(query); // constructed outside `b.iter`, so not part of the measured closure
+        sync_group.bench_function(query_name, |b| b.iter(|| 
read_test.run_sync()));
+    }
+}
+
+criterion_group!(benches, sync_reader, async_reader); // registers both benchmark functions in the `benches` group
+criterion_main!(benches); // generates the benchmark binary's `main` for that group
+
+/// Predicate Function.
+///
+/// Functions are invoked with the requested array and return a [`BooleanArray`]
+/// as described in [`ArrowPredicate::evaluate`].
+type ColumnPredicateFn =
+    dyn FnMut(&ArrayRef) -> Result<BooleanArray, ArrowError> + Send + Sync + 
'static; // `FnMut`: the predicate may keep mutable state across calls; `dyn` makes this an unsized type, usable only behind a pointer
+
+/// ClickBench query pattern: a particular set of filters and projections used in the
+/// [ClickBench queries] when run in [Apache DataFusion].
+///
+/// [ClickBench queries]: https://github.com/apache/datafusion/blob/main/benchmarks/queries/clickbench/queries.sql
+/// [Apache DataFusion]: https://datafusion.apache.org/
+struct Query {

Review Comment:
   Very good! Thank you @alamb 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to