alamb commented on code in PR #22353:
URL: https://github.com/apache/datafusion/pull/22353#discussion_r3275557700
##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -577,9 +580,17 @@ impl<'a> TestCase<'a> {
match page_index_filtering_expected {
PageIndexFilteringExpected::None => {
+ let (page_index_rows_pruned, page_index_rows_matched) =
+ pruning_metrics_or_zero(&metrics,
"page_index_rows_pruned");
+ println!(" page_index_rows_pruned: {page_index_rows_pruned}");
Review Comment:
do we really need the println! in these tests?
##########
datafusion/core/tests/parquet/filter_pushdown.rs:
##########
@@ -591,29 +602,55 @@ impl<'a> TestCase<'a> {
}
}
-fn get_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize,
usize) {
+fn pruning_metrics_or_zero(metrics: &MetricsSet, metric_name: &str) -> (usize,
usize) {
+ get_pruning_metrics(metrics, metric_name, true)
+}
+
+fn expect_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize,
usize) {
+ get_pruning_metrics(metrics, metric_name, false)
+}
+
+fn get_pruning_metrics(
+ metrics: &MetricsSet,
+ metric_name: &str,
+ allow_missing: bool,
+) -> (usize, usize) {
match metrics.sum_by_name(metric_name) {
Some(MetricValue::PruningMetrics {
pruning_metrics, ..
}) => (pruning_metrics.pruned(), pruning_metrics.matched()),
Some(_) => {
panic!("Metric '{metric_name}' is not a pruning metric
in\n\n{metrics:#?}")
}
- None => panic!(
- "Expected metric not found. Looking for '{metric_name}'
in\n\n{metrics:#?}"
- ),
+ None if allow_missing => (0, 0),
+ None => panic!("Expected metric '{metric_name}' not found
in\n\n{metrics:#?}"),
}
}
-fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
+fn metric_value_or_zero(metrics: &MetricsSet, metric_name: &str) -> usize {
Review Comment:
It might be clearer/more "rust idomatic" if you this used option instead
```rust
fn metric_value(metrics: &MetricsSet, metric_name: &str) -> Option<usize. {
```
And then change the callsites to
```rust
assert_eq!(metric_value(..), Some(..))
```
or
```rust
assert!(metric_value(..).is_none()
```
I think it would make the changes above easier to undesrtand as well as more
specifically test things (e.g. it could potentially protect against
regressions where the metrics are registered again)
##########
datafusion/datasource-parquet/src/metrics.rs:
##########
@@ -238,3 +204,372 @@ impl ParquetFileMetrics {
count.add(n);
}
}
+
+#[derive(Debug)]
+struct ParquetFileMetricRegistration {
Review Comment:
I think we need to add some comments explaining what this is doing and
**why** it exists
Specifically, I think it is important to point out that the cost of creating
metrics is non trivial (as there are several String allocations, etc) so that
this structure exists to postpone registering these metrics until we know they
are non zer
##########
datafusion/core/tests/sql/explain_analyze.rs:
##########
@@ -269,23 +305,16 @@ async fn explain_analyze_level() {
#[tokio::test]
async fn explain_analyze_level_datasource_parquet() {
- let table_name = "tpch_lineitem_small";
- let parquet_path = "tests/data/tpch_lineitem_small.parquet";
- let sql = format!("EXPLAIN ANALYZE SELECT * FROM {table_name}");
-
- // Register test parquet file into context
- let ctx = SessionContext::new();
- ctx.register_parquet(table_name, parquet_path,
ParquetReadOptions::default())
- .await
- .expect("register parquet table for explain analyze test");
+ let (ctx, _temp_dir) = page_pruning_parquet_context().await;
Review Comment:
maybe we can add the rationale as comments
##########
datafusion/core/tests/parquet/mod.rs:
##########
@@ -267,6 +267,20 @@ impl TestOutput {
}
}
+pub(crate) fn zero_if_metric_absent(
Review Comment:
as above, I recommend changing the tests to explicitly check `Some` / `None`
rather than trying to translate None --> zero. Since you already need to change
the callsites anyways to use this new function I think it would be easier to
undesrtand what was going on
This could be done as a follow on PR too
##########
datafusion/datasource-parquet/src/metrics.rs:
##########
@@ -103,114 +108,75 @@ impl ParquetFileMetrics {
// -----------------------
// 'summary' level metrics
// -----------------------
- let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .pruning_metrics("row_groups_pruned_bloom_filter", partition);
-
- let limit_pruned_row_groups = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .pruning_metrics("limit_pruned_row_groups", partition);
-
- let row_groups_pruned_statistics = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .pruning_metrics("row_groups_pruned_statistics", partition);
-
- let page_index_pages_pruned = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .pruning_metrics("page_index_pages_pruned", partition);
-
- let bytes_scanned = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .with_category(MetricCategory::Bytes)
- .counter("bytes_scanned", partition);
-
- let metadata_load_time = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .subset_time("metadata_load_time", partition);
-
- let files_ranges_pruned_statistics = MetricBuilder::new(metrics)
- .with_type(MetricType::Summary)
- .pruning_metrics("files_ranges_pruned_statistics", partition);
-
- let scan_efficiency_ratio = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
- .with_type(MetricType::Summary)
- .ratio_metrics_with_strategy(
- "scan_efficiency_ratio",
- partition,
- RatioMergeStrategy::AddPartSetTotal,
- );
+ let row_groups_pruned_bloom_filter = PruningMetrics::new();
+ let limit_pruned_row_groups = PruningMetrics::new();
+ let row_groups_pruned_statistics = PruningMetrics::new();
+ let page_index_pages_pruned = PruningMetrics::new();
+ let bytes_scanned = Count::new();
+ let metadata_load_time = Time::new();
+ let files_ranges_pruned_statistics = PruningMetrics::new();
+ let scan_efficiency_ratio =
+
RatioMetrics::new().with_merge_strategy(RatioMergeStrategy::AddPartSetTotal);
// -----------------------
// 'dev' level metrics
// -----------------------
- let predicate_evaluation_errors = MetricBuilder::new(metrics)
- .with_new_label("filename", filename.to_string())
Review Comment:
If I look at this, it seems to me like one challenge is that there are many
many new strings with the file name. Maybe it is worth looking into if we could
replace the owned string in Label with a Arc<str> (as another PR)
https://github.com/apache/datafusion/blob/3aefba75be1ec571849136de5f2d575e6c505b7b/datafusion/physical-expr-common/src/metrics/mod.rs#L526-L527
It might be tricky however, as we would have to retain the special case for
`&'static str` 🤔
Maybe we could make a new thing instead of Cow like
```rust
enum ArcCow {
/// Static string
Static(&'static str),
/// Shared arc
Shared(Arc<str>),
/// Owned allocation
Owned(String)
}
```
With some finagling (like `impl Into<ArcCow>` I suspect we could make it
mostly backwards compatible with the existing APIs / callsites 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]