kosiew commented on code in PR #15694:
URL: https://github.com/apache/datafusion/pull/15694#discussion_r2041328132


##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;

Review Comment:
   ```suggestion
   const PRE_SELECTION_THRESHOLD: f32 = 0.2;
   ```



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTIO_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTIO_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// FIXME: Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
+fn pre_selection_scatter(
+    left_result: &BooleanArray,
+    right_result: ColumnarValue,
+) -> Result<ColumnarValue> {
+    let right_array = if let ColumnarValue::Array(array) = right_result {
+        array
+    } else {
+        return Ok(right_result);
+    };
+    let right_boolean_array = right_array.as_boolean();
+    let result_len = left_result.len();
+
+    // keep track of how much is filled
+    let mut filled = 0;
+    // keep track of current position we have in right boolean array
+    let mut right_array_pos = 0;
+
+    let mut result_array_builder = BooleanArray::builder(result_len);
+    SlicesIterator::new(left_result).for_each(|(start, end)| {
+        // the gap needs to be filled with false
+        if start > filled {
+            (filled..start).for_each(|_| 
result_array_builder.append_value(false));
+        }
+        // fill with right_result values
+        let len = end - start;
+        right_boolean_array
+            .slice(right_array_pos, len)
+            .iter()
+            .for_each(|v| result_array_builder.append_option(v));
+
+        right_array_pos += len;
+        filled = end;
+    });
+    // the remaining part is falsy
+    if filled < result_len {
+        (filled..result_len).for_each(|_| 
result_array_builder.append_value(false));
+    }
+    let boolean_result = result_array_builder.finish();
+
+    Ok(ColumnarValue::Array(Arc::new(boolean_result)))

Review Comment:
   Here's an alternative approach using last_end, instead of filled.
   
   last_end tracks the end of the last filled region.
   ```
       // Build the resulting array efficiently
       let mut result_array_builder = BooleanArray::builder(result_len);
       let mut right_pos = 0;
   
       // Process the array using iterator of slices (contiguous true regions)
       let mut last_end = 0;
       for (start, end) in SlicesIterator::new(left_result) {
           // Fill the gap between last processed position and current slice 
start with false
           result_array_builder.append_n(start - last_end, false);
   
           // Copy values from right array for this slice
           let slice_len = end - start;
           right_boolean_array
               .slice(right_pos, slice_len)
               .iter()
               .for_each(|v| result_array_builder.append_option(v));
   
           right_pos += slice_len;
           last_end = end;
       }
   
       // Fill any remaining positions with false
       if last_end < result_len {
           result_array_builder.append_n(result_len - last_end, false);
       }
   
       Ok(ColumnarValue::Array(Arc::new(result_array_builder.finish())))
    ```



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTIO_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTIO_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// FIXME: Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
+fn pre_selection_scatter(
+    left_result: &BooleanArray,
+    right_result: ColumnarValue,
+) -> Result<ColumnarValue> {
+    let right_array = if let ColumnarValue::Array(array) = right_result {
+        array
+    } else {
+        return Ok(right_result);
+    };

Review Comment:
   ```
       // Return scalar values as-is
       if let ColumnarValue::Scalar(_) = right_result {
           return Ok(right_result);
       }
   
       let right_array = right_result.into_array(left_result.len())?;
   ```
   Maybe this reads easier?    



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTIO_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTIO_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// FIXME: Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].

Review Comment:
   ```suggestion
   /// FIXME: Perhaps it would be better to modify `left_result` directly 
without creating a copy?
   /// In practice, `left_result` should have only one owner, so making changes 
should be safe.
   /// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
   /// Creates a new boolean array based on the evaluation of the right 
expression,
   /// but only for positions where the left_result is true.
   ///
   /// This function is used for short-circuit evaluation optimization of 
logical AND operations:
   /// - When left_result has few true values, we only evaluate the right 
expression for those positions
   /// - Values are copied from right_array where left_result is true
   /// - All other positions are filled with false values
   ///
   /// @param left_result Boolean array with selection mask (typically from 
left side of AND)
   /// @param right_result Result of evaluating right side of expression (only 
for selected positions)
   /// @return A combined ColumnarValue with values from right_result where 
left_result is true
   ```



##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTIO_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTIO_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// FIXME: Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
+fn pre_selection_scatter(
+    left_result: &BooleanArray,
+    right_result: ColumnarValue,
+) -> Result<ColumnarValue> {
+    let right_array = if let ColumnarValue::Array(array) = right_result {
+        array
+    } else {
+        return Ok(right_result);
+    };
+    let right_boolean_array = right_array.as_boolean();
+    let result_len = left_result.len();
+
+    // keep track of how much is filled
+    let mut filled = 0;
+    // keep track of current position we have in right boolean array
+    let mut right_array_pos = 0;
+
+    let mut result_array_builder = BooleanArray::builder(result_len);
+    SlicesIterator::new(left_result).for_each(|(start, end)| {
+        // the gap needs to be filled with false
+        if start > filled {
+            (filled..start).for_each(|_| 
result_array_builder.append_value(false));
+        }
+        // fill with right_result values
+        let len = end - start;
+        right_boolean_array
+            .slice(right_array_pos, len)
+            .iter()
+            .for_each(|v| result_array_builder.append_option(v));
+
+        right_array_pos += len;
+        filled = end;
+    });
+    // the remaining part is falsy
+    if filled < result_len {
+        (filled..result_len).for_each(|_| 
result_array_builder.append_value(false));
+    }
+    let boolean_result = result_array_builder.finish();
+
+    Ok(ColumnarValue::Array(Arc::new(boolean_result)))

Review Comment:
   Interestingly, this improved the performance further.
   
   <img width="468" alt="image" 
src="https://github.com/user-attachments/assets/98c89c3d-5eb0-43a9-822c-40156e280f4a";
 />
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to