This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new ee2dc833b0 Add an example of boundary analysis simple expressions.
(#14688)
ee2dc833b0 is described below
commit ee2dc833b0f13dc5d9bc1971e4c66748adfc9ef9
Author: @clflushopt <[email protected]>
AuthorDate: Mon Feb 17 07:14:07 2025 -0500
Add an example of boundary analysis simple expressions. (#14688)
* feat(examples): Add an example of boundary analysis for AND/OR exprs
The goal of this change is to add an example to explain data flow during
boundary analysis of AND and OR expressions.
* fix(examples): refine demo code for the example and cut the number of
cases
* fix(examples): remove left-over
* fix(examples): address linting issues
---
datafusion-examples/examples/expr_api.rs | 74 +++++++++++++++++++++++++++++++-
1 file changed, 73 insertions(+), 1 deletion(-)
diff --git a/datafusion-examples/examples/expr_api.rs
b/datafusion-examples/examples/expr_api.rs
index 2908edbb75..349850df61 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array};
use arrow::record_batch::RecordBatch;
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::stats::Precision;
use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::DFSchema;
+use datafusion::common::{ColumnStatistics, DFSchema};
use datafusion::common::{ScalarValue, ToDFSchema};
use datafusion::error::Result;
use datafusion::functions_aggregate::first_last::first_value_udaf;
@@ -80,6 +81,9 @@ async fn main() -> Result<()> {
// See how to analyze ranges in expressions
range_analysis_demo()?;
+ // See how to analyze boundaries in different kinds of expressions.
+ boundary_analysis_and_selectivity_demo()?;
+
// See how to determine the data types of expressions
expression_type_demo()?;
@@ -275,6 +279,74 @@ fn range_analysis_demo() -> Result<()> {
Ok(())
}
+// DataFusion's analysis can infer boundary statistics and selectivity in
+// various situations which can be helpful in building more efficient
+// query plans.
+fn boundary_analysis_and_selectivity_demo() -> Result<()> {
+ // Consider the example where we want all rows with an `id` greater than
+ // or equal to 5000.
+ let id_greater_5000 = col("id").gt_eq(lit(5000i64));
+
+ // As in most examples we must tell DataFusion the type of the column.
+ let schema = Arc::new(Schema::new(vec![make_field("id",
DataType::Int64)]));
+
+ // DataFusion is able to do cardinality estimation on various column types.
+ // These estimates, represented by the `ColumnStatistics` type, describe
+ // properties such as the maximum and minimum value, the number of distinct
+ // values and the number of null values.
+ let column_stats = ColumnStatistics {
+ null_count: Precision::Exact(0),
+ max_value: Precision::Exact(ScalarValue::Int64(Some(10000))),
+ min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+ sum_value: Precision::Absent,
+ distinct_count: Precision::Absent,
+ };
+
+ // We can then build our expression boundaries from the column statistics
+ // allowing the analysis to be more precise.
+ let initial_boundaries =
+ vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+ // With the above we can perform the boundary analysis similar to the previous
+ // example.
+ let df_schema = DFSchema::try_from(schema.clone())?;
+
+ // Analysis case id >= 5000
+ let physical_expr1 =
+ SessionContext::new().create_physical_expr(id_greater_5000,
&df_schema)?;
+ let analysis = analyze(
+ &physical_expr1,
+ AnalysisContext::new(initial_boundaries.clone()),
+ df_schema.as_ref(),
+ )?;
+
+ // The analysis will return better bounds thanks to the column statistics.
+ assert_eq!(
+ analysis.boundaries.first().map(|boundary| boundary
+ .interval
+ .clone()
+ .unwrap()
+ .into_bounds()),
+ Some((
+ ScalarValue::Int64(Some(5000)),
+ ScalarValue::Int64(Some(10000))
+ ))
+ );
+
+ // We can also infer selectivity from the column statistics by assuming
+ // that the column is uniformly distributed and using the following
+ // estimation formula:
+ // Assuming the original range is [a, b] and the new range: [a', b']
+ //
+ // (b' - a' + 1) / (b - a)
+ // (10000 - 5000 + 1) / (10000 - 1) = 5001 / 9999, roughly 0.5
+ assert!(analysis
+ .selectivity
+ .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
+
+ Ok(())
+}
+
fn make_field(name: &str, data_type: DataType) -> Field {
let nullable = false;
Field::new(name, data_type, nullable)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]