Modified: trunk/Tools/ChangeLog (291044 => 291045)
--- trunk/Tools/ChangeLog 2022-03-09 13:29:13 UTC (rev 291044)
+++ trunk/Tools/ChangeLog 2022-03-09 13:31:01 UTC (rev 291045)
@@ -1,3 +1,49 @@
+2022-03-09 Angelos Oikonomopoulos <[email protected]>
+
+ [JSC] Make runner should only schedule tests on live remotes
+ https://bugs.webkit.org/show_bug.cgi?id=237030
+
+ Reviewed by Adrian Perez de Castro.
+
+ The make runner does static scheduling of tests, i.e. it will
+ assign tests to all known remotes. In practice, this means that
+ even though the make runner participates in the retry loop, it
+ will always end up scheduling any tests that failed to run
+ (e.g. because the remote was down on startup or went away during
+ testing) on all remotes, including ones that are known to be down.
+
+ This patch makes it slightly more robust by introducing a liveness
+ check (getLiveRemoteHosts) before the static scheduling, i.e.
+ 1. in the initial preparation of the tests to run (prepareArtifacts) and
+ 2. in refreshExecution.
+
+ It reuses the command to get the number of processors as the
+ liveness check (it's a command we know currenly works in all
+ supported remotes), so flips numberOfProcessors inside-out.
+ While here, it adds getconf _NPROCESSORS_ONLN as a final attempt
+ to get the number of processors.
+
+ This does not mean that the make runner is now robust against
+ flaky remotes. At the very least, it's still missing an ssh
+ timeout (so it won't hang e.g. when hosts go away after the
+ initial connection is established) and child cleanup (e.g. when
+ bailing early from one remote in forEachRemote, it needs to kill
+ all children for other remotes, or we might end up retrying while
+ the old jobs are still running). But it's getting there.
+
+ Q: But isn't the GNU parallel runner much better suited for
+ handling flaky remotes?
+
+ A: Yes. However, the make runner is (somewhat) better at keeping
+ the remote CPUs busy with "fast" remotes, so we may still want to
+ use it when the remotes are both fast and stable enough. That
+ said, it still needs to be able to recover from the occasional
+ failed remote without producing false test failures.
+
+ * Scripts/run-jsc-stress-tests:
+ * Scripts/webkitruby/jsc-stress-test/executor.rb:
+ Make remoteHosts available to refreshExecution.
+
2022-03-08 Jean-Yves Avenard <[email protected]>
Have MediaFormatReader plugin use WebMParser directly
Modified: trunk/Tools/Scripts/run-jsc-stress-tests (291044 => 291045)
--- trunk/Tools/Scripts/run-jsc-stress-tests 2022-03-09 13:29:13 UTC (rev 291044)
+++ trunk/Tools/Scripts/run-jsc-stress-tests 2022-03-09 13:31:01 UTC (rev 291045)
@@ -2589,17 +2589,21 @@
`#{cmd}`
end
-def numberOfProcessors
+def commandToGetNumberOfProcessors
if $hostOS == "windows"
- numProcessors = runCommandOnTester("cmd /c echo %NUMBER_OF_PROCESSORS%").to_i
+ "cmd /c echo %NUMBER_OF_PROCESSORS%"
else
- begin
- numProcessors = runCommandOnTester("sysctl -n hw.activecpu 2>/dev/null || nproc --all 2>/dev/null").to_i
- rescue
- numProcessors = 0
- end
+ "sysctl -n hw.activecpu 2>/dev/null || nproc --all 2>/dev/null || getconf _NPROCESSORS_ONLN"
end
+end
+def numberOfProcessors
+ begin
+ numProcessors = runCommandOnTester(commandToGetNumberOfProcessors).to_i
+ rescue
+ numProcessors = 0
+ end
+
if numProcessors == 0
$stderr.puts("Warning: could not determine the number of remote CPUs, defaulting to 1")
numProcessors = 1
@@ -2727,12 +2731,8 @@
if not remoteHosts.nil?
remoteHost = remoteHosts[remoteIndex]
getRemoteDirectoryIfNeeded(remoteHost)
- copyBundleToRemote(remoteHost)
remoteScript = "\""
- remoteScript += "cd #{remoteHost.remoteDirectory} && "
- remoteScript += "rm -rf #{$outputDir.basename} && "
- remoteScript += "tar xzf #{$tarFileName} && "
- remoteScript += "cd #{$outputDir.basename}/.runner && "
+ remoteScript += "cd #{remoteHost.remoteDirectory}/#{$outputDir.basename}/.runner && "
remoteScript += exportBaseEnvironmentVariables(true)
$envVars.each { |var| remoteScript += "export " << var << "\n" }
remoteScript += "#{testRunner.command(remoteIndex)}\""
@@ -3074,7 +3074,7 @@
}
end
-def unpackBundleGnuParallel(remoteHosts)
+def unpackBundleOnRemoteHosts(remoteHosts)
forEachRemote(remoteHosts, :dropOnFailure => true, :timeout => REMOTE_TIMEOUT) {
| _, remoteHost |
mysys(["ssh"] + SSH_OPTIONS_DEFAULT +
@@ -3085,6 +3085,19 @@
}
end
+def prepareRemotes(remoteHosts)
+ # If the preparatory steps fail, drop the remote host from our
+ # list. Otherwise, if it comes back online in the middle of an
+ # iteration, we'll try to run test jobs on it, possibly using
+ # an unrelated bundle from a previous run.
+ remoteHosts = forEachRemote(remoteHosts, {:dropOnFailure => true, :timeout => REMOTE_TIMEOUT}) {
+ | _, remoteHost |
+ getRemoteDirectoryIfNeeded(remoteHost)
+ copyBundleToRemote(remoteHost)
+ }
+ return unpackBundleOnRemoteHosts(remoteHosts)
+end
+
def runGnuParallelRunner(remoteHosts, inputs, options={})
timeout = 300
if ENV["JSCTEST_timeout"]
@@ -3158,11 +3171,11 @@
def prepareExecution(remoteHosts)
remoteHosts
end
- def refreshExecution(runlist, serialPlans, completedTests)
+ def refreshExecution(runlist, serialPlans, completedTests, remoteHosts)
if not @remoteHosts.nil?
raise "remoteHosts #{@remoteHosts}, expected nil"
end
- @testRunner.prepare(runlist, serialPlans, completedTests, @remoteHosts)
+ @testRunner.prepare(runlist, serialPlans, completedTests, remoteHosts)
end
end
@@ -3205,13 +3218,13 @@
def updateStatusMap(iteration, statusMap)
return getStatusMap(statusMap)
end
- def refreshExecution(runlist, serialPlans, completedTests)
+ def refreshExecution(runlist, serialPlans, completedTests, remoteHosts)
# Note, when running tests remotely, we always prepare tests
# for all the hosts when refreshing execution, as even hosts
# that went away during testing may have come back online in
# the meantime -- so we don't expect a list of live remote
# hosts to be passed in.
- @testRunner.prepare(runlist, serialPlans, completedTests, @remoteHosts)
+ @testRunner.prepare(runlist, serialPlans, completedTests, remoteHosts)
end
end
@@ -3219,18 +3232,44 @@
def prepareArtifacts(remoteHosts)
raise "remoteHosts nil" if remoteHosts.nil?
prepareBundle
+
+ # The make runner statically shards the tests, so do a
+ # liveness check before @testRunner.prepare. Otherwise, we'd
+ # always end up scheduling tests on dead remotes and hamper
+ # our progress.
+ remoteHosts = getLiveRemoteHosts(remoteHosts)
@testRunner.prepare(@runlist, @serialPlans, Set.new, remoteHosts)
compressBundle
+ remoteHosts
end
def prepareExecution(remoteHosts)
- remoteHosts
+ return prepareRemotes(remoteHosts)
end
+ def refreshExecution(runlist, serialPlans, completedTests, remoteHosts)
+ # Don't assign tests to remotes that are down.
+ remoteHosts = getLiveRemoteHosts(remoteHosts)
+ super(runlist, serialPlans, completedTests, remoteHosts)
+ end
def executeTests(remoteHosts)
- forEachRemote(remoteHosts) {
+ forEachRemote(remoteHosts, :dropOnFailure => true) {
| index |
runTestRunner(@testRunner, remoteHosts, index)
}
end
+ private
+ def getLiveRemoteHosts(remoteHosts)
+ # This command is supposed to work on all remote hosts, so
+ # reuse it for the liveness check.
+ cmd = commandToGetNumberOfProcessors
+ forEachRemote(remoteHosts, :dropOnFailure => true, :timeout => REMOTE_TIMEOUT) { |_, remoteHost|
+ begin
+ sshRead(cmd, remoteHost)
+ rescue
+ # It's what forEachRemote considers an expected failure.
+ raise CommandExecutionFailed
+ end
+ }
+ end
end
class GnuParallelTestsExecutor < BaseRemoteTestsExecutor
@@ -3240,16 +3279,7 @@
compressBundle
end
def prepareExecution(remoteHosts)
- # If the preparatory steps fail, drop the remote host from our
- # list. Otherwise, if it comes back online in the middle of an
- # iteration, we'll try to run test jobs on it, possibly using
- # an unrelated bundle from a previous run.
- remoteHosts = forEachRemote(remoteHosts, {:dropOnFailure => true, :timeout => REMOTE_TIMEOUT}) {
- | _, remoteHost |
- getRemoteDirectoryIfNeeded(remoteHost)
- copyBundleToRemote(remoteHost)
- }
- return unpackBundleGnuParallel(remoteHosts)
+ return prepareRemotes(remoteHosts)
end
def executeTests(remoteHosts)
runGnuParallelRunner(remoteHosts, $runnerDir + "serial-tests",
Modified: trunk/Tools/Scripts/webkitruby/jsc-stress-test/executor.rb (291044 => 291045)
--- trunk/Tools/Scripts/webkitruby/jsc-stress-test/executor.rb 2022-03-09 13:29:13 UTC (rev 291044)
+++ trunk/Tools/Scripts/webkitruby/jsc-stress-test/executor.rb 2022-03-09 13:31:01 UTC (rev 291045)
@@ -154,7 +154,7 @@
end
# Regenerate the lists of tests to run
- refreshExecution(@runlist, @serialPlans, completedTests)
+ refreshExecution(@runlist, @serialPlans, completedTests, @remoteHosts)
numFlaky = @runlist.size - completedTests.size - evaluator.missing.size
testsInfo = "completed #{completedTests.size}/#{@runlist.size} (#{numFlaky} flaky, #{evaluator.missing.size} missing)"
@@ -209,7 +209,7 @@
putd("statusMap: #{statusMap}")
statusMap
end
- def refreshExecution(runlist, serialPlans, completedTests)
+ def refreshExecution(runlist, serialPlans, completedTests, remoteHosts)
end
end