acking-you commented on code in PR #15694:
URL: https://github.com/apache/datafusion/pull/15694#discussion_r2041635786


##########
datafusion/physical-expr/src/expressions/binary.rs:
##########
@@ -811,58 +822,164 @@ impl BinaryExpr {
     }
 }
 
+enum ShortCircuitStrategy<'a> {
+    None,
+    ReturnLeft,
+    ReturnRight,
+    PreSelection(&'a BooleanArray),
+}
+
+/// Based on the results calculated from the left side of the short-circuit 
operation,
+/// if the proportion of `true` is less than 0.2 and the current operation is 
an `and`,
+/// the `RecordBatch` will be filtered in advance.
+const PRE_SELECTIO_THRESHOLD: f32 = 0.2;
+
 /// Checks if a logical operator (`AND`/`OR`) can short-circuit evaluation 
based on the left-hand side (lhs) result.
 ///
-/// Short-circuiting occurs when evaluating the right-hand side (rhs) becomes 
unnecessary:
-/// - For `AND`: if ALL values in `lhs` are `false`, the expression must be 
`false` regardless of rhs.
-/// - For `OR`: if ALL values in `lhs` are `true`, the expression must be 
`true` regardless of rhs.
-///
-/// Returns `true` if short-circuiting is possible, `false` otherwise.
-///
+/// Short-circuiting occurs under these circumstances:
+/// - For `AND`:
+///    - if LHS is all false => short-circuit → return LHS
+///    - if LHS is all true  => short-circuit → return RHS
+///    - if LHS is mixed and true_count/sum_count <= 
[`PRE_SELECTIO_THRESHOLD`] -> pre-selection
+/// - For `OR`:
+///    - if LHS is all true  => short-circuit → return LHS
+///    - if LHS is all false => short-circuit → return RHS
 /// # Arguments
-/// * `arg` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
+/// * `lhs` - The left-hand side (lhs) columnar value (array or scalar)
 /// * `op` - The logical operator (`AND` or `OR`)
 ///
 /// # Implementation Notes
 /// 1. Only works with Boolean-typed arguments (other types automatically 
return `false`)
 /// 2. Handles both scalar values and array values
-/// 3. For arrays, uses optimized `true_count()`/`false_count()` methods from 
arrow-rs.
-///    `bool_or`/`bool_and` maybe a better choice too,for detailed 
discussion,see:[link](https://github.com/apache/datafusion/pull/15462#discussion_r2020558418)
-fn check_short_circuit(arg: &ColumnarValue, op: &Operator) -> bool {
-    let data_type = arg.data_type();
-    match (data_type, op) {
-        (DataType::Boolean, Operator::And) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.false_count() == array.len();
-                    }
+/// 3. For arrays, uses optimized bit counting techniques for boolean arrays
+fn check_short_circuit<'a>(
+    lhs: &'a ColumnarValue,
+    op: &Operator,
+) -> ShortCircuitStrategy<'a> {
+    // Quick reject for non-logical operators,and quick judgment when op is and
+    let is_and = match op {
+        Operator::And => true,
+        Operator::Or => false,
+        _ => return ShortCircuitStrategy::None,
+    };
+
+    // Non-boolean types can't be short-circuited
+    if lhs.data_type() != DataType::Boolean {
+        return ShortCircuitStrategy::None;
+    }
+
+    match lhs {
+        ColumnarValue::Array(array) => {
+            // Fast path for arrays - try to downcast to boolean array
+            if let Ok(bool_array) = as_boolean_array(array) {
+                // Arrays with nulls can't be short-circuited
+                if bool_array.null_count() > 0 {
+                    return ShortCircuitStrategy::None;
                 }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return !value;
+
+                let len = bool_array.len();
+                if len == 0 {
+                    return ShortCircuitStrategy::None;
+                }
+
+                let true_count = bool_array.values().count_set_bits();
+                if is_and {
+                    // For AND, prioritize checking for all-false (short 
circuit case)
+                    // Uses optimized false_count() method provided by Arrow
+
+                    // Short circuit if all values are false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no false values, then all must be true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnRight;
+                    }
+
+                    // determine if we can pre-selection
+                    if true_count as f32 / len as f32 <= 
PRE_SELECTIO_THRESHOLD {
+                        return ShortCircuitStrategy::PreSelection(bool_array);
+                    }
+                } else {
+                    // For OR, prioritize checking for all-true (short circuit 
case)
+                    // Uses optimized true_count() method provided by Arrow
+
+                    // Short circuit if all values are true
+                    if true_count == len {
+                        return ShortCircuitStrategy::ReturnLeft;
+                    }
+
+                    // If no true values, then all must be false
+                    if true_count == 0 {
+                        return ShortCircuitStrategy::ReturnRight;
                     }
                 }
             }
-            false
         }
-        (DataType::Boolean, Operator::Or) => {
-            match arg {
-                ColumnarValue::Array(array) => {
-                    if let Ok(array) = as_boolean_array(&array) {
-                        return array.true_count() == array.len();
-                    }
-                }
-                ColumnarValue::Scalar(scalar) => {
-                    if let ScalarValue::Boolean(Some(value)) = scalar {
-                        return *value;
-                    }
+        ColumnarValue::Scalar(scalar) => {
+            // Fast path for scalar values
+            if let ScalarValue::Boolean(Some(is_true)) = scalar {
+                // Return Left for:
+                // - AND with false value
+                // - OR with true value
+                if (is_and && !is_true) || (!is_and && *is_true) {
+                    return ShortCircuitStrategy::ReturnLeft;
+                } else {
+                    return ShortCircuitStrategy::ReturnRight;
                 }
             }
-            false
         }
-        _ => false,
     }
+
+    // If we can't short-circuit, indicate that normal evaluation should 
continue
+    ShortCircuitStrategy::None
+}
+
+/// FIXME: Perhaps it would be better to modify `left_result` directly without 
creating a copy?
+/// In practice, `left_result` should have only one owner, so making changes 
should be safe.
+/// However, this is difficult to achieve under the immutable constraints of 
[`Arc`] and [`BooleanArray`].
+fn pre_selection_scatter(
+    left_result: &BooleanArray,
+    right_result: ColumnarValue,
+) -> Result<ColumnarValue> {
+    let right_array = if let ColumnarValue::Array(array) = right_result {
+        array
+    } else {
+        return Ok(right_result);
+    };
+    let right_boolean_array = right_array.as_boolean();
+    let result_len = left_result.len();
+
+    // keep track of how much is filled
+    let mut filled = 0;
+    // keep track of current position we have in right boolean array
+    let mut right_array_pos = 0;
+
+    let mut result_array_builder = BooleanArray::builder(result_len);
+    SlicesIterator::new(left_result).for_each(|(start, end)| {
+        // the gap needs to be filled with false
+        if start > filled {
+            (filled..start).for_each(|_| 
result_array_builder.append_value(false));
+        }
+        // fill with right_result values
+        let len = end - start;
+        right_boolean_array
+            .slice(right_array_pos, len)
+            .iter()
+            .for_each(|v| result_array_builder.append_option(v));
+
+        right_array_pos += len;
+        filled = end;
+    });
+    // the remaining part is falsy
+    if filled < result_len {
+        (filled..result_len).for_each(|_| 
result_array_builder.append_value(false));
+    }
+    let boolean_result = result_array_builder.finish();
+
+    Ok(ColumnarValue::Array(Arc::new(boolean_result)))

Review Comment:
   `append_n` is indeed a more efficient choice. I checked its source code and 
found that it doesn't handle `append_n(0, true)` separately to skip it. 
However, for `append_n(0, false)`, it will be detected and skipped during 
`self.advance`. Since we will only append `false`, we don't need a separate 
check, but we should add a comment to clarify this behavior.
   
   I continued to check the source code and found that its 
`self.null_buffer_builder.append_n_non_nulls(additional)` will `append true`, 
so we still need an if statement to check whether it is 0.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to