Modified: trunk/Tools/ChangeLog (242657 => 242658)
--- trunk/Tools/ChangeLog 2019-03-08 23:19:27 UTC (rev 242657)
+++ trunk/Tools/ChangeLog 2019-03-08 23:20:15 UTC (rev 242658)
@@ -1,3 +1,29 @@
+2019-03-08 Saam barati <[email protected]>
+
+ Add a compare-results script to compare benchmark results
+ https://bugs.webkit.org/show_bug.cgi?id=195486
+ <rdar://problem/48723397>
+
+ Reviewed by Geoffrey Garen.
+
+ This patch adds a script to compare benchmark results using Welch's two-tailed t test.
+ Initially, this patch only reasons about PLT5/JetStream2/Speedometer2. It will be easy
+ to extend it to learn about our other benchmarks.
+
+ * Scripts/compare-results: Added.
+ (readJSONFile):
+ (detectJetStream2):
+ (JetStream2Results):
+ (detectSpeedometer2):
+ (Speedometer2Results):
+ (detectPLT5):
+ (PLT5Results):
+ (detectBenchmark):
+ (biggerIsBetter):
+ (ttest):
+ (getOptions):
+ (main):
+
2019-03-08 Stephanie Lewis <[email protected]>
Ensure old tab state is cleared between iterations of run-benchmark
Added: trunk/Tools/Scripts/compare-results (0 => 242658)
--- trunk/Tools/Scripts/compare-results (rev 0)
+++ trunk/Tools/Scripts/compare-results 2019-03-08 23:20:15 UTC (rev 242658)
@@ -0,0 +1,198 @@
+#!/usr/bin/env python -u
+
+# Copyright (C) 2019 Apple Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. Neither the name of Apple Inc. ("Apple") nor the names of
+# its contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import sys
+import argparse
+import json
+
+try:
+ from scipy import stats
+except:
+ print "ERROR: scipy package is not installed. Run `pip install scipy`"
+ sys.exit(1)
+
+try:
+ import numpy
+except:
+ print "ERROR: numpy package is not installed. Run `pip install numpy`"
+ sys.exit(1)
+
+def readJSONFile(path):
+ with open(path, 'r') as contents:
+ return json.loads(contents.read())
+
+# Canonical names for the benchmark types this script understands.
+Speedometer2 = "Speedometer2"
+JetStream2 = "JetStream2"
+PLT5 = "PLT5"
+
+def detectJetStream2(payload):
+ return "JetStream2.0" in payload
+
+def JetStream2Results(payload):
+ assert detectJetStream2(payload)
+
+ js = payload["JetStream2.0"]
+ iterations = len(js["tests"]["gaussian-blur"]["metrics"]["Score"]["current"])
+ results = []
+ for i in range(iterations):
+ scores = []
+ for test in js["tests"].keys():
+ scores.append(js["tests"][test]["metrics"]["Score"]["current"][i])
+ geomean = stats.gmean(scores)
+
+ results.append(geomean)
+
+ return results
+
+def detectSpeedometer2(payload):
+ return "Speedometer-2" in payload
+
+def Speedometer2Results(payload):
+ assert detectSpeedometer2(payload)
+ results = []
+ for arr in payload["Speedometer-2"]["metrics"]["Score"]["current"]:
+ results.append(numpy.mean(arr))
+ return results
+
+def detectPLT5(payload):
+ if "iterations" not in payload:
+ return False
+ iterations = payload["iterations"]
+ if not isinstance(iterations, list):
+ return False
+ if not len(iterations):
+ return False
+ if "cold" not in iterations[0]:
+ return False
+ if "warm" not in iterations[0]:
+ return False
+ if "Geometric" not in iterations[0]:
+ return False
+ return True
+
+def PLT5Results(payload):
+ assert detectPLT5(payload)
+ results = []
+ for obj in payload["iterations"]:
+ results.append(obj["Geometric"])
+ return results
+
+def detectBenchmark(payload):
+ if detectJetStream2(payload):
+ return JetStream2
+ if detectSpeedometer2(payload):
+ return Speedometer2
+ if detectPLT5(payload):
+ return PLT5
+ return None
+
+def biggerIsBetter(benchmarkType):
+ if benchmarkType == JetStream2:
+ return True
+ if benchmarkType == Speedometer2:
+ return True
+ if benchmarkType == PLT5:
+ return False
+
+ print "Should not be reached."
+ assert False
+
+def ttest(benchmarkType, a, b):
+ # We use two-tailed Welch's
+ (tStatistic, pValue) = stats.ttest_ind(a, b, equal_var=False)
+ aMean = numpy.mean(a)
+ bMean = numpy.mean(b)
+ print "a mean = {:.5f}".format(aMean)
+ print "b mean = {:.5f}".format(bMean)
+
+ print "pValue = {:.10f}".format(pValue)
+
+ if biggerIsBetter(benchmarkType):
+ print "(Bigger means are better.)"
+ if aMean > bMean:
+ print "{:.3f} times worse".format((aMean / bMean))
+ else:
+ print "{:.3f} times better".format((bMean / aMean))
+ else:
+ print "(Smaller means are better.)"
+ if aMean > bMean:
+ print "{:.3f} times better".format((aMean / bMean))
+ else:
+ print "{:.3f} times worse".format((bMean / aMean))
+
+ if pValue <= 0.05:
+ print "Results ARE significant"
+ else:
+ print "Results ARE NOT significant"
+
+def getOptions():
+ parser = argparse.ArgumentParser(description="Compare two WebKit benchmark results. Pass in two JSON result files to compare them. This script prints the pValue along with the magnitude of the change.")
+
+ parser.add_argument("-a",
+ type=str,
+ required=True,
+ help="a of a/b. Path to JSON results file.")
+
+ parser.add_argument("-b",
+ type=str,
+ required=True,
+ help="b of a/b. Path to JSON results file.")
+
+ return parser.parse_known_args()[0]
+
+
+def main():
+ args = getOptions()
+
+ a = readJSONFile(args.a)
+ b = readJSONFile(args.b)
+
+ typeA = detectBenchmark(a)
+ typeB = detectBenchmark(b)
+
+ if typeA != typeB:
+ print "-a and -b are not the same benchmark. a={} b={}".format(typeA, typeB)
+ sys.exit(1)
+
+ if not (typeA and typeB):
+ print "Unknown benchmark type. a={} b={}".format(typeA, typeB)
+ sys.exit(1)
+
+ if typeA == JetStream2:
+ ttest(typeA, JetStream2Results(a), JetStream2Results(b))
+ elif typeA == Speedometer2:
+ ttest(typeA, Speedometer2Results(a), Speedometer2Results(b))
+ elif typeA == PLT5:
+ ttest(typeA, PLT5Results(a), PLT5Results(b))
+ else:
+ print "Unknown benchmark type"
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()
+