alamb commented on code in PR #3868:
URL: https://github.com/apache/arrow-datafusion/pull/3868#discussion_r999964895


##########
datafusion/physical-expr/src/physical_expr.rs:
##########
@@ -61,6 +62,81 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug {
             Ok(tmp_result)
         }
     }
+    /// Return the expression statistics for this expression. This API is 
currently experimental.
+    fn expr_stats(&self) -> Arc<dyn PhysicalExprStats> {
+        Arc::new(BasicExpressionStats {})
+    }
+}
+
+/// Statistics about the result of a single expression.
+#[derive(Clone, Debug, PartialEq)]
+pub struct ExprBoundaries {
+    /// Maximum value this expression's result can have.
+    pub max_value: ScalarValue,
+    /// Minimum value this expression's result can have.
+    pub min_value: ScalarValue,
+    /// Maximum number of distinct values this expression can produce.
+    pub distinct_count: Option<usize>,
+    /// Selectivity of this expression if it were used as a predicate.
+    pub selectivity: Option<f64>,
+}
+
+impl ExprBoundaries {
+    /// Create a new `ExprBoundaries`.
+    pub fn new(
+        max_value: ScalarValue,
+        min_value: ScalarValue,
+        distinct_count: Option<usize>,
+    ) -> Self {
+        Self {
+            max_value,
+            min_value,
+            distinct_count,
+            selectivity: None,
+        }
+    }
+
+    /// Try to reduce the expression boundaries to a single value if possible.
+    pub fn reduce(&self) -> Option<ScalarValue> {
+        if self.min_value == self.max_value {
+            Some(self.min_value.clone())
+        } else {
+            None
+        }
+    }
+}
+
+/// A toolkit to work with physical expression statistics. This API is 
currently experimental
+/// and might be subject to change.
+pub trait PhysicalExprStats: Send + Sync {
+    /// Return an estimate of the boundaries this expression's result 
would have (in
+    /// terms of minimum and maximum values it can take as well as the number of 
unique values
+    /// it can produce). The inputs are the column-level statistics from the 
current physical
+    /// plan.
+    fn boundaries(&self, columns: &[ColumnStatistics]) -> 
Option<ExprBoundaries>;
+
+    #[allow(unused_variables)]
+    /// Apply the given boundaries to this column. Currently only applicable 
for top level columns.
+    fn update_boundaries(
+        &self,
+        columns: &[ColumnStatistics],
+        boundaries: &ExprBoundaries,
+    ) -> Vec<ColumnStatistics> {
+        // TODO: for supporting recursive boundary updates, we need to have 
per-column level
+        // expression boundaries with known ids (either indexes or something 
like that).
+        columns.to_vec()
+    }

Review Comment:
   It is an interesting idea to pass back a `Vec<ExprBoundaries>` so we can keep 
partial knowledge about range boundaries.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to