adriangb commented on code in PR #19136:
URL: https://github.com/apache/datafusion/pull/19136#discussion_r2596201331
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -521,6 +547,102 @@ fn copy_arrow_reader_metrics(
}
}
+type ConstantColumns = HashMap<usize, ScalarValue>;
+
+/// Extract constant column values from statistics, keyed by column index in
the logical file schema.
+fn constant_columns_from_stats(
+ statistics: Option<&Statistics>,
+ file_schema: &SchemaRef,
+) -> ConstantColumns {
+ let mut constants = HashMap::new();
+ let Some(statistics) = statistics else {
+ return constants;
+ };
+
+ let num_rows = match statistics.num_rows {
+ Precision::Exact(num_rows) => Some(num_rows),
+ _ => None,
+ };
+
+ for (idx, column_stats) in statistics
+ .column_statistics
+ .iter()
+ .take(file_schema.fields().len())
+ .enumerate()
+ {
+ if let Some(value) = constant_value_from_stats(
+ column_stats,
+ num_rows,
+ file_schema.field(idx).data_type(),
+ ) {
+ constants.insert(idx, value);
+ }
+ }
+
+ constants
+}
+
+fn constant_value_from_stats(
+ column_stats: &ColumnStatistics,
+ num_rows: Option<usize>,
+ data_type: &DataType,
+) -> Option<ScalarValue> {
+ if let (Precision::Exact(min), Precision::Exact(max)) =
+ (&column_stats.min_value, &column_stats.max_value)
+ {
+ if min == max
+ && !min.is_null()
+ && matches!(column_stats.null_count, Precision::Exact(0))
+ {
+ return Some(min.clone());
+ }
+ }
+
+ if let (Some(num_rows), Precision::Exact(nulls)) =
+ (num_rows, &column_stats.null_count)
+ {
+ if *nulls == num_rows {
+ return ScalarValue::try_new_null(data_type).ok();
+ }
+ }
+
+ None
+}
+
+fn rewrite_projection_with_constants(
+ projection: ProjectionExprs,
+ constants: &ConstantColumns,
+) -> Result<ProjectionExprs> {
+ if constants.is_empty() {
+ return Ok(projection);
+ }
+
+ projection.try_map_exprs(|expr| rewrite_physical_expr_with_constants(expr,
constants))
+}
+
+fn rewrite_physical_expr_with_constants(
Review Comment:
This looks very similar to
https://github.com/apache/datafusion/pull/19128/files#diff-6bad7e4ee6dbc3a498e3fee746f2c3c18bdcf237d7cd12226e392f9b9c3d2fbe,
we should be able to use it for partition values as well 😄
##########
datafusion/datasource-parquet/src/opener.rs:
##########
@@ -176,6 +187,17 @@ impl FileOpener for ParquetOpener {
// We'll also check this after every record batch we read,
// and if at some point we are able to prove we can prune the file
using just the file level statistics
// we can end the stream early.
+ if !constant_columns.is_empty() {
+ predicate = predicate
+ .map(|expr| {
+ if is_dynamic_physical_expr(&expr) {
Review Comment:
Why do we need this clause? What breaks if we remove it? I'd think that
rewriting the dynamic expression would work - it would try to rewrite its
children, which shouldn't cause any issues. Once `snapshot` is called the
produced expression should have the remapped children.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]