acking-you commented on code in PR #15694: URL: https://github.com/apache/datafusion/pull/15694#discussion_r2047346420
########## datafusion/physical-expr/src/expressions/binary.rs: ########## @@ -811,58 +822,199 @@ impl BinaryExpr { } } +enum ShortCircuitStrategy<'a> { + None, + ReturnLeft, + ReturnRight, + PreSelection(&'a BooleanArray), +} + +/// Based on the results calculated from the left side of the short-circuit operation, +/// if the proportion of `true` is less than 0.2 and the current operation is an `and`, +/// the `RecordBatch` will be filtered in advance. +const PRE_SELECTION_THRESHOLD: f32 = 0.2; + /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation based on the left-hand side (lhs) result. /// -/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes unnecessary: -/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be `false` regardless of rhs. -/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be `true` regardless of rhs. -/// -/// Returns `true` if short-circuiting is possible, `false` otherwise. -/// +/// Short-circuiting occurs under these circumstances: +/// - For `AND`: +/// - if LHS is all false => short-circuit → return LHS +/// - if LHS is all true => short-circuit → return RHS +/// - if LHS is mixed and true_count/sum_count <= [`PRE_SELECTION_THRESHOLD`] -> pre-selection +/// - For `OR`: +/// - if LHS is all true => short-circuit → return LHS +/// - if LHS is all false => short-circuit → return RHS /// # Arguments -/// * `arg` - The left-hand side (lhs) columnar value (array or scalar) +/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar) +/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar) /// * `op` - The logical operator (`AND` or `OR`) /// /// # Implementation Notes /// 1. Only works with Boolean-typed arguments (other types automatically return `false`) /// 2. Handles both scalar values and array values -/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from arrow-rs. -/// `bool_or`/`bool_and` maybe a better choice too,for detailed discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418) -fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool { - let data_type = arg.data_type(); - match (data_type, op) { - (DataType::Boolean, Operator::And) => { - match arg { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - return array.false_count() == array.len(); - } +/// 3. For arrays, uses optimized bit counting techniques for boolean arrays +fn check_short_circuit<'a>( + lhs: &'a ColumnarValue, + op: &Operator, +) -> ShortCircuitStrategy<'a> { + // Quick reject for non-logical operators,and quick judgment when op is and + let is_and = match op { + Operator::And => true, + Operator::Or => false, + _ => return ShortCircuitStrategy::None, + }; + + // Non-boolean types can't be short-circuited + if lhs.data_type() != DataType::Boolean { + return ShortCircuitStrategy::None; + } + + match lhs { + ColumnarValue::Array(array) => { + // Fast path for arrays - try to downcast to boolean array + if let Ok(bool_array) = as_boolean_array(array) { + // Arrays with nulls can't be short-circuited + if bool_array.null_count() > 0 { + return ShortCircuitStrategy::None; } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - return !value; + + let len = bool_array.len(); + if len == 0 { + return ShortCircuitStrategy::None; + } + + let true_count = bool_array.values().count_set_bits(); + if is_and { + // For AND, prioritize checking for all-false (short circuit case) + // Uses optimized false_count() method provided by Arrow + + // Short circuit if all values are false + if true_count == 0 { + return ShortCircuitStrategy::ReturnLeft; + } + + // If no false values, then all must be true + if true_count == len { + return ShortCircuitStrategy::ReturnRight; + } + + // determine if we can pre-selection + if true_count as f32 / len as f32 <= PRE_SELECTION_THRESHOLD { + return ShortCircuitStrategy::PreSelection(bool_array); + } + } else { + // For OR, prioritize checking for all-true (short circuit case) + // Uses optimized true_count() method provided by Arrow + + // Short circuit if all values are true + if true_count == len { + return ShortCircuitStrategy::ReturnLeft; + } + + // If no true values, then all must be false + if true_count == 0 { + return ShortCircuitStrategy::ReturnRight; } } } - false } - (DataType::Boolean, Operator::Or) => { - match arg { - ColumnarValue::Array(array) => { - if let Ok(array) = as_boolean_array(&array) { - return array.true_count() == array.len(); - } - } - ColumnarValue::Scalar(scalar) => { - if let ScalarValue::Boolean(Some(value)) = scalar { - return *value; - } + ColumnarValue::Scalar(scalar) => { + // Fast path for scalar values + if let ScalarValue::Boolean(Some(is_true)) = scalar { + // Return Left for: + // - AND with false value + // - OR with true value + if (is_and && !is_true) || (!is_and && *is_true) { + return ShortCircuitStrategy::ReturnLeft; + } else { + return ShortCircuitStrategy::ReturnRight; } } - false } - _ => false, } + + // If we can't short-circuit, indicate that normal evaluation should continue + ShortCircuitStrategy::None +} + +/// Creates a new boolean array based on the evaluation of the right expression, +/// but only for positions where the left_result is true. +/// +/// This function is used for short-circuit evaluation optimization of logical AND operations: +/// - When left_result has few true values, we only evaluate the right expression for those positions +/// - Values are copied from right_array where left_result is true +/// - All other positions are filled with false values +/// +/// # Parameters +/// - `left_result` Boolean array with selection mask (typically from left side of AND) +/// - `right_result` Result of evaluating right side of expression (only for selected positions) +/// +/// # Returns +/// A combined ColumnarValue with values from right_result where left_result is true +/// +/// # Example +/// Initial Data: { 1, 2, 3, 4, 5 } +/// Left Evaluation +/// (Condition: Equal to 2 or 3) +/// ↓ +/// Filtered Data: {2, 3} +/// Left Bitmap: { 0, 1, 1, 0, 0 } +/// ↓ +/// Right Evaluation +/// (Condition: Even numbers) +/// ↓ +/// Right Data: { 2 } +/// Right Bitmap: { 1, 0 } +/// ↓ +/// Combine Results +/// Final Bitmap: { 0, 1, 0, 0, 0 } +/// +/// # Note +/// Perhaps it would be better to modify `left_result` directly without creating a copy? +/// In practice, `left_result` should have only one owner, so making changes should be safe. +/// However, this is difficult to achieve under the immutable constraints of [`Arc`] and [`BooleanArray`]. +fn pre_selection_scatter( Review Comment: done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org