EmilyMatt commented on code in PR #2615:
URL: https://github.com/apache/datafusion-comet/pull/2615#discussion_r2450900855


##########
spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala:
##########
@@ -107,6 +107,94 @@ object CometMetricNode {
         SQLMetrics.createNanoTimingMetric(sc, "Total time for casting 
columns"))
   }
 
+  def parquetScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+    Map(
+      "numOutputRows" -> SQLMetrics.createMetric(sc, "number of output rows"),
+      "scanTime" -> SQLMetrics.createNanoTimingMetric(sc, "scan time"),
+      "ParquetRowGroups" -> SQLMetrics.createMetric(sc, "num of Parquet row 
groups read"),
+      "ParquetNativeDecodeTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in Parquet native decoding"),
+      "ParquetNativeLoadTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in loading Parquet native vectors"),
+      "ParquetLoadRowGroupTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in loading Parquet row groups"),
+      "ParquetInputFileReadTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in reading Parquet file from storage"),
+      "ParquetInputFileReadSize" -> SQLMetrics.createSizeMetric(
+        sc,
+        "read size when reading Parquet file from storage (MB)"),
+      "ParquetInputFileReadThroughput" -> SQLMetrics.createAverageMetric(
+        sc,
+        "read throughput when reading Parquet file from storage (MB/sec)"))
+  }
+
+  def nativeScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+    Map(
+      "output_rows" -> SQLMetrics.createMetric(sc, "number of output rows"),

Review Comment:
   Yes, I think we may need to add a numOutputRows metric as well; otherwise some streaming tests fail. I'll make it point to the same metric, since the native scan reports it as "output_rows".



##########
spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala:
##########
@@ -107,6 +107,94 @@ object CometMetricNode {
         SQLMetrics.createNanoTimingMetric(sc, "Total time for casting 
columns"))
   }
 
+  def parquetScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+    Map(
+      "numOutputRows" -> SQLMetrics.createMetric(sc, "number of output rows"),
+      "scanTime" -> SQLMetrics.createNanoTimingMetric(sc, "scan time"),
+      "ParquetRowGroups" -> SQLMetrics.createMetric(sc, "num of Parquet row 
groups read"),
+      "ParquetNativeDecodeTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in Parquet native decoding"),
+      "ParquetNativeLoadTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in loading Parquet native vectors"),
+      "ParquetLoadRowGroupTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in loading Parquet row groups"),
+      "ParquetInputFileReadTime" -> SQLMetrics.createNanoTimingMetric(
+        sc,
+        "time spent in reading Parquet file from storage"),
+      "ParquetInputFileReadSize" -> SQLMetrics.createSizeMetric(
+        sc,
+        "read size when reading Parquet file from storage (MB)"),
+      "ParquetInputFileReadThroughput" -> SQLMetrics.createAverageMetric(
+        sc,
+        "read throughput when reading Parquet file from storage (MB/sec)"))
+  }
+
+  def nativeScanMetrics(sc: SparkContext): Map[String, SQLMetric] = {
+    Map(
+      "output_rows" -> SQLMetrics.createMetric(sc, "number of output rows"),

Review Comment:
   Testing this


