seancxmao commented on a change in pull request #23258: 
[SPARK-23375][SQL][FOLLOWUP][TEST] Test Sort metrics while Sort is missing
URL: https://github.com/apache/spark/pull/23258#discussion_r244085228
 
 

 ##########
 File path: 
sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala
 ##########
 @@ -198,6 +199,52 @@ trait SQLMetricsTestUtils extends SQLTestUtils {
       }
     }
   }
+
+  private def metricStats(metricStr: String): Seq[String] = {
+    val sum = metricStr.substring(0, 
metricStr.indexOf("(")).stripPrefix("\n").stripSuffix(" ")
+    val minMedMax = metricStr.substring(metricStr.indexOf("(") + 1, 
metricStr.indexOf(")"))
+      .split(", ").toSeq
+    (sum +: minMedMax)
+  }
+
+  private def stringToBytes(str: String): (Float, String) = {
+    val matcher =
+      Pattern.compile("([0-9]+(\\.[0-9]+)?) 
(EiB|PiB|TiB|GiB|MiB|KiB|B)").matcher(str)
+    if (matcher.matches()) {
+      (matcher.group(1).toFloat, matcher.group(3))
+    } else {
+      throw new NumberFormatException("Failed to parse byte string: " + str)
+    }
+  }
+
+  private def stringToDuration(str: String): (Float, String) = {
+    val matcher = Pattern.compile("([0-9]+(\\.[0-9]+)?) 
(ms|s|m|h)").matcher(str)
+    if (matcher.matches()) {
+      (matcher.group(1).toFloat, matcher.group(3))
+    } else {
+      throw new NumberFormatException("Failed to parse time string: " + str)
+    }
+  }
+
+  /**
+   * Convert a size metric string to a sequence of stats, including sum, min, 
med and max in order,
+   * each a tuple of (value, unit).
+   * @param metricStr size metric string, e.g. "\n96.2 MB (32.1 MB, 32.1 MB, 
32.1 MB)"
+   * @return A sequence of stats, e.g. ((96.2,MB), (32.1,MB), (32.1,MB), 
(32.1,MB))
+   */
+  protected def sizeMetricStats(metricStr: String): Seq[(Float, String)] = {
+    metricStats(metricStr).map(stringToBytes)
+  }
+
+  /**
+   * Convert a timing metric string to a sequence of stats, including sum, 
min, med and max in
+   * order, each a tuple of (value, unit).
+   * @param metricStr timing metric string, e.g. "\n2.0 ms (1.0 ms, 1.0 ms, 
1.0 ms)"
+   * @return A sequence of stats, e.g. ((2.0,ms), (1.0,ms), (1.0,ms), (1.0,ms))
+   */
+  protected def timingMetricStats(metricStr: String): Seq[(Float, String)] = {
+    metricStats(metricStr).map(stringToDuration)
+  }
 
 Review comment:
   Yes, currently these functions are only used for `test("Sort metrics")`. 
What `SQLMetricsSuite` has been checking so far is almost exclusively integer 
metrics (e.g. "number of output rows", "records read", ...). However, we should 
also check non-integer metrics, such as timing and size metrics. These metrics 
all share the same format: `"total (min, med, max)"`. These helper functions 
could be used to check all of them. Please see the screenshot I posted above 
for more timing and size metric examples (shuffle write, shuffle read, ...).
   
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to