jenkins-bot has submitted this change and it was merged.
Change subject: Follow up to 4d06e1ba
......................................................................
Follow up to 4d06e1ba
* @Inez noticed that ctrl-c sends SIGINT to all processes in the group,
causing workers to race to disconnect. This could put a worker in
the "disconnected" state, making `worker.disconnect()` throw and
preventing the timeout from being cleared. When it fires, we get a
"kill ESRCH".
* Cluster in node v0.8 doesn't have a `worker.kill()` method, so we use
the internal worker.process.kill instead. This is what node does
anyways and is safer than the `process.kill()` in the timeout, so we
use it for both.
* Finally, we prevent killed workers from starting back up on exit.
That's something that was missed in the service-runner port.
Change-Id: I61a8f034cde7d5ef97e3064683197fdc4e98e00a
---
M bin/server.js
1 file changed, 15 insertions(+), 7 deletions(-)
Approvals:
Subramanya Sastry: Looks good to me, approved
Mobrovac: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/bin/server.js b/bin/server.js
index 2da9a61..a30d045 100755
--- a/bin/server.js
+++ b/bin/server.js
@@ -123,25 +123,29 @@
});
};
-var stopWorker = function(workerId) {
+var stopWorker = Promise.method(function(workerId) {
var worker = cluster.workers[workerId];
+ // ctrl-c sends SIGINT to all the workers, so make sure we haven't lost
+ // the race. In the disconnected state, `worker.disconnect()` throws
+ // an error, "channel closed".
+ if (worker.state === 'disconnected') { return; }
var p = new Promise(function(resolve) {
var timeout = setTimeout(function() {
//
https://nodejs.org/api/cluster.html#cluster_worker_kill_signal_sigterm
// `worker.kill()` wants `worker.state ===
'disconnected'`.
// If that doesn't happen shortly, escalate!
- process.kill(worker.process.pid, 'SIGKILL');
+ worker.process.kill('SIGKILL');
resolve();
}, 10 * 1000);
- worker.on('disconnect', function() {
- worker.kill('SIGKILL');
+ worker.once('disconnect', function() {
clearTimeout(timeout);
+ worker.process.kill('SIGKILL');
resolve();
});
});
worker.disconnect();
return p;
-};
+});
if (cluster.isMaster && argv.n > 0) {
// Master
@@ -187,8 +191,10 @@
spawn();
}
+ var shuttingDown = false;
+
cluster.on('exit', function(worker, code, signal) {
- if (!worker.suicide) {
+ if (!worker.suicide && !shuttingDown) {
var pid = worker.process.pid;
processLogger.log("warning", util.format("worker %s
died (%s), restarting.", pid, signal || code));
if (timer) { timer.count('worker.exit.' + (signal ||
code), ''); }
@@ -197,11 +203,12 @@
});
var shutdownMaster = function() {
+ shuttingDown = true;
processLogger.log('info', 'shutting down, killing workers');
Promise.map(Object.keys(cluster.workers),
stopWorker).then(function() {
processLogger.log('info', 'exiting');
process.exit(0);
- });
+ }).done();
};
process.on('SIGINT', shutdownMaster);
@@ -215,6 +222,7 @@
process.exit(0);
};
+ process.on('SIGINT', shutdownWorker);
process.on('SIGTERM', shutdownWorker);
process.on('disconnect', shutdownWorker);
--
To view, visit https://gerrit.wikimedia.org/r/268848
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I61a8f034cde7d5ef97e3064683197fdc4e98e00a
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Inez <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits