EmilyMatt commented on code in PR #2615:
URL: https://github.com/apache/datafusion-comet/pull/2615#discussion_r2450900855
##########
spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala:
##########
@@ -107,6 +107,94 @@ object CometMetricNode {
SQLMetrics.createNanoTimingMetric(sc, "Total time for casting columns"))
}
+ def parquetScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+ Map(
+ "numOutputRows" -> SQLMetrics.createMetric(sc, "number of output rows"),
+ "scanTime" -> SQLMetrics.createNanoTimingMetric(sc, "scan time"),
+ "ParquetRowGroups" -> SQLMetrics.createMetric(sc, "num of Parquet row
groups read"),
+ "ParquetNativeDecodeTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in Parquet native decoding"),
+ "ParquetNativeLoadTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in loading Parquet native vectors"),
+ "ParquetLoadRowGroupTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in loading Parquet row groups"),
+ "ParquetInputFileReadTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in reading Parquet file from storage"),
+ "ParquetInputFileReadSize" -> SQLMetrics.createSizeMetric(
+ sc,
+ "read size when reading Parquet file from storage (MB)"),
+ "ParquetInputFileReadThroughput" -> SQLMetrics.createAverageMetric(
+ sc,
+ "read throughput when reading Parquet file from storage (MB/sec)"))
+ }
+
+ def nativeScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+ Map(
+ "output_rows" -> SQLMetrics.createMetric(sc, "number of output rows"),
Review Comment:
Yes, I think we may need to add a numOutputRows metric as well; otherwise
some streaming tests fail. I'll make it point to the same metric that the
native scan reports as "output_rows".
##########
spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala:
##########
@@ -107,6 +107,94 @@ object CometMetricNode {
SQLMetrics.createNanoTimingMetric(sc, "Total time for casting columns"))
}
+ def parquetScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+ Map(
+ "numOutputRows" -> SQLMetrics.createMetric(sc, "number of output rows"),
+ "scanTime" -> SQLMetrics.createNanoTimingMetric(sc, "scan time"),
+ "ParquetRowGroups" -> SQLMetrics.createMetric(sc, "num of Parquet row
groups read"),
+ "ParquetNativeDecodeTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in Parquet native decoding"),
+ "ParquetNativeLoadTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in loading Parquet native vectors"),
+ "ParquetLoadRowGroupTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in loading Parquet row groups"),
+ "ParquetInputFileReadTime" -> SQLMetrics.createNanoTimingMetric(
+ sc,
+ "time spent in reading Parquet file from storage"),
+ "ParquetInputFileReadSize" -> SQLMetrics.createSizeMetric(
+ sc,
+ "read size when reading Parquet file from storage (MB)"),
+ "ParquetInputFileReadThroughput" -> SQLMetrics.createAverageMetric(
+ sc,
+ "read throughput when reading Parquet file from storage (MB/sec)"))
+ }
+
+ def nativeScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+ Map(
+ "output_rows" -> SQLMetrics.createMetric(sc, "number of output rows"),
Review Comment:
Testing this
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]