alamb commented on code in PR #15694:
URL: https://github.com/apache/datafusion/pull/15694#discussion_r2045649141


##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,199 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.

Review Comment:
   this seems reasonable to me



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,199 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTION_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTION_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTION_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// Creates a new boolean array based on the evaluation of the right 
expression,
+/// but only for positions where the left_result is true.
+///
+/// This function is used for short-circuit evaluation optimization of logical 
AND operations:
+/// - When left_result has few true values, we only evaluate the right 
expression for those positions
+/// - Values are copied from right_array where left_result is true
+/// - All other positions are filled with false values
+///
+/// # Parameters
+/// - `left_result` Boolean array with selection mask (typically from left 
side of AND)
+/// - `right_result` Result of evaluating right side of expression (only for 
selected positions)
+///
+/// # Returns
+/// A combined ColumnarValue with values from right_result where left_result 
is true
+///
+/// # Example
+///  Initial Data: { 1, 2, 3, 4, 5 }
+///  Left Evaluation
+///     (Condition: Equal to 2 or 3)
+///          ↓
+///  Filtered Data: {2, 3}
+///    Left Bitmap: { 0, 1, 1, 0, 0 }
+///          ↓
+///   Right Evaluation
+///     (Condition: Even numbers)
+///          ↓
+///  Right Data: { 2 }
+///    Right Bitmap: { 1, 0 }
+///          ↓
+///   Combine Results
+///  Final Bitmap: { 0, 1, 0, 0, 0 }
+///
+/// # Note
+/// Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
+fn pre_selection_scatter(

Review Comment:
   Can you please add some sort of additional coverage for this function -- 
ideally a fuzz test perhaps to ccover the edge cases?



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,199 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTION_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTION_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTION_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// Creates a new boolean array based on the evaluation of the right 
expression,
+/// but only for positions where the left_result is true.
+///
+/// This function is used for short-circuit evaluation optimization of logical 
AND operations:
+/// - When left_result has few true values, we only evaluate the right 
expression for those positions
+/// - Values are copied from right_array where left_result is true
+/// - All other positions are filled with false values
+///
+/// # Parameters
+/// - `left_result` Boolean array with selection mask (typically from left 
side of AND)
+/// - `right_result` Result of evaluating right side of expression (only for 
selected positions)
+///
+/// # Returns
+/// A combined ColumnarValue with values from right_result where left_result 
is true
+///
+/// # Example
+///  Initial Data: { 1, 2, 3, 4, 5 }
+///  Left Evaluation
+///     (Condition: Equal to 2 or 3)
+///          ↓
+///  Filtered Data: {2, 3}
+///    Left Bitmap: { 0, 1, 1, 0, 0 }
+///          ↓
+///   Right Evaluation
+///     (Condition: Even numbers)
+///          ↓
+///  Right Data: { 2 }
+///    Right Bitmap: { 1, 0 }
+///          ↓
+///   Combine Results
+///  Final Bitmap: { 0, 1, 0, 0, 0 }
+///
+/// # Note
+/// Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].

Review Comment:
   I think we could potentially use kernels like `unary_mut` to reuse the 
allocation: https://docs.rs/arrow/latest/arrow/compute/fn.unary_mut.html
   
   However, I don't think that is currently supported in arrow-rs for 
`BooleanArray` -- we could add it upstream if we wanted to explore this 
optimization
   
   However each BooleanArray is likely to have 8192 elements , so 1K bytes, 
where maybe the overhead is ok



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to