(datafusion) branch main updated: Add an example of boundary analysis simple expressions. (#14688)

alamb Mon, 17 Feb 2025 04:36:09 -0800

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new ee2dc833b0 Add an example of boundary analysis simple expressions. (#14688)
ee2dc833b0 is described below

commit ee2dc833b0f13dc5d9bc1971e4c66748adfc9ef9
Author: @clflushopt <[email protected]>
AuthorDate: Mon Feb 17 07:14:07 2025 -0500

    Add an example of boundary analysis simple expressions. (#14688)
    
    * feat(examples): Add an example of boundary analysis for AND/OR exprs
    
    The goal of this change is to add an example to explain data flow during
    boundary analysis of AND and OR expressions.
    
    * fix(examples): refine demo code for the example and cut the number of cases
    
    * fix(examples): remove left-over
    
    * fix(examples): address linting issues
---
 datafusion-examples/examples/expr_api.rs | 74 +++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 2908edbb75..349850df61 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array};
 use arrow::record_batch::RecordBatch;
 
 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::stats::Precision;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::DFSchema;
+use datafusion::common::{ColumnStatistics, DFSchema};
 use datafusion::common::{ScalarValue, ToDFSchema};
 use datafusion::error::Result;
 use datafusion::functions_aggregate::first_last::first_value_udaf;
@@ -80,6 +81,9 @@ async fn main() -> Result<()> {
     // See how to analyze ranges in expressions
     range_analysis_demo()?;
 
+    // See how to analyze boundaries in different kinds of expressions.
+    boundary_analysis_and_selectivity_demo()?;
+
     // See how to determine the data types of expressions
     expression_type_demo()?;
 
@@ -275,6 +279,74 @@ fn range_analysis_demo() -> Result<()> {
     Ok(())
 }
 
+// DataFusion's analysis can infer boundary statistics and selectivity in
+// various situations which can be helpful in building more efficient
+// query plans.
+fn boundary_analysis_and_selectivity_demo() -> Result<()> {
+    // Consider the example where we want all rows with an `id` greater than
+    // or equal to 5000.
+    let id_greater_5000 = col("id").gt_eq(lit(5000i64));
+
+    // As in most examples, we must tell DataFusion the type of the column.
+    let schema = Arc::new(Schema::new(vec![make_field("id", 
DataType::Int64)]));
+
+    // DataFusion is able to do cardinality estimation on various column types.
+    // These estimates, represented by the `ColumnStatistics` type, describe
+    // properties such as the maximum and minimum value, the number of distinct
+    // values and the number of null values.
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(10000))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Absent,
+    };
+
+    // We can then build our expression boundaries from the column statistics
+    // allowing the analysis to be more precise.
+    let initial_boundaries =
+        vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+    // With the above we can perform the boundary analysis similar to the
+    // previous example.
+    let df_schema = DFSchema::try_from(schema.clone())?;
+
+    // Analysis case id >= 5000
+    let physical_expr1 =
+        SessionContext::new().create_physical_expr(id_greater_5000, 
&df_schema)?;
+    let analysis = analyze(
+        &physical_expr1,
+        AnalysisContext::new(initial_boundaries.clone()),
+        df_schema.as_ref(),
+    )?;
+
+    // The analysis will return better bounds thanks to the column statistics.
+    assert_eq!(
+        analysis.boundaries.first().map(|boundary| boundary
+            .interval
+            .clone()
+            .unwrap()
+            .into_bounds()),
+        Some((
+            ScalarValue::Int64(Some(5000)),
+            ScalarValue::Int64(Some(10000))
+        ))
+    );
+
+    // We can also infer selectivity from the column statistics by assuming
+    // that the column is uniformly distributed and using the following
+    // estimation formula:
+    // Assuming the original range is [a, b] and the new range is [a', b']:
+    //
+    // (b' - a' + 1) / (b - a)
+    // (10000 - 5000 + 1) / (10000 - 1) = 5001 / 9999 ~= 0.5
+    assert!(analysis
+        .selectivity
+        .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
+
+    Ok(())
+}
+
 fn make_field(name: &str, data_type: DataType) -> Field {
     let nullable = false;
     Field::new(name, data_type, nullable)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: Add an example of boundary analysis simple expressions. (#14688)

Reply via email to