adriangb commented on code in PR #20160: URL: https://github.com/apache/datafusion/pull/20160#discussion_r2768613927
########## datafusion/physical-expr/src/expressions/adaptive_selectivity_filter.rs: ########## @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A wrapper [`PhysicalExpr`] that tracks filter selectivity at runtime and +//! automatically disables filters that aren't pruning enough rows. + +use std::any::Any; +use std::fmt::Display; +use std::hash::Hash; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; + +use arrow::array::{ArrayRef, BooleanArray}; +use arrow::datatypes::{DataType, Schema}; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::DynHash; +use parking_lot::RwLock; + +use crate::PhysicalExpr; + +/// Configuration for selectivity-based filter disabling. +#[derive(Debug, Clone)] +pub struct SelectivityConfig { + /// Threshold above which the filter is disabled (e.g., 0.95 = 95% selectivity). + /// If the filter passes this fraction or more of rows, it will be disabled. + pub threshold: f64, Review Comment: Could be in GB/s? Rows/s? ########## datafusion/physical-expr/src/expressions/adaptive_selectivity_filter.rs: ########## @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! A wrapper [`PhysicalExpr`] that tracks filter selectivity at runtime and +//! automatically disables filters that aren't pruning enough rows. + +use std::any::Any; +use std::fmt::Display; +use std::hash::Hash; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; + +use arrow::array::{ArrayRef, BooleanArray}; +use arrow::datatypes::{DataType, Schema}; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_expr::ColumnarValue; +use datafusion_physical_expr_common::physical_expr::DynHash; +use parking_lot::RwLock; + +use crate::PhysicalExpr; + +/// Configuration for selectivity-based filter disabling. +#[derive(Debug, Clone)] +pub struct SelectivityConfig { + /// Threshold above which the filter is disabled (e.g., 0.95 = 95% selectivity). + /// If the filter passes this fraction or more of rows, it will be disabled. + pub threshold: f64, + /// Minimum rows to process before making a selectivity decision. + pub min_rows: usize, Review Comment: Could be in GB? Number of batches? Time? (or all of the above?) ########## datafusion/common/src/config.rs: ########## @@ -1115,6 +1115,34 @@ config_namespace! { /// See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds> pub hash_join_inlist_pushdown_max_distinct_values: usize, default = 150 + /// Enable selectivity-based disabling of dynamic filters from joins. + /// + /// When enabled, join dynamic filters that pass most rows (above the threshold) + /// will be automatically disabled to avoid evaluation overhead. This is useful + /// when the build side of a join covers most of the probe side values, making + /// the filter expensive to evaluate for little benefit. + /// + /// The selectivity tracking resets when the dynamic filter is updated (e.g., when + /// the hash table is built), allowing the filter to be re-evaluated with new data. + pub enable_dynamic_filter_selectivity_tracking: bool, default = false + + /// Selectivity threshold for disabling join dynamic filters. + /// + /// If the filter passes this fraction or more of rows, it will be disabled. + /// Value should be between 0.0 and 1.0. + /// + /// For example, 0.95 means if 95% or more of rows pass the filter, it will be disabled. + /// Only used when `enable_dynamic_filter_selectivity_tracking` is true. + pub dynamic_filter_selectivity_threshold: f64, default = 0.95 + + /// Minimum number of rows to process before making a selectivity decision + /// for join dynamic filters. + /// + /// The filter will remain in a tracking state until this many rows have been + /// processed. This ensures statistical stability before making the disable decision. + /// Only used when `enable_dynamic_filter_selectivity_tracking` is true. + pub dynamic_filter_min_rows_for_selectivity: usize, default = 10_000 Review Comment: Not sure we need all of these, or at least not sure they should be prefixed with `dynamic_filter` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
