Should the licensing daemon somehow crash, the watchdog will automatically 
restart it.

Signed-off-by: Rob Hoes <[email protected]>


 ocaml/license/v6client.ml |   22 +++++-
 ocaml/license/v6daemon.ml |  147 +++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 146 insertions(+), 23 deletions(-)


# HG changeset patch
# User Rob Hoes <[email protected]>
# Date 1267012968 0
# Node ID 64d19cc82053e494ed647895994fc56f88c52b90
# Parent  4698cec6d4eaba7c7082ff40f591b579687f08ec
CA-37939: Add watchdog to v6d, and retry functionality to v6client in xapi.

Should the licensing daemon somehow crash, the watchdog will automatically restart it.

Signed-off-by: Rob Hoes <[email protected]>

diff -r 4698cec6d4ea -r 64d19cc82053 ocaml/license/v6client.ml
--- a/ocaml/license/v6client.ml	Wed Feb 24 11:57:56 2010 +0000
+++ b/ocaml/license/v6client.ml	Wed Feb 24 12:02:48 2010 +0000
@@ -16,7 +16,6 @@
 open D
 
 exception V6DaemonFailure
-exception Unmarshalling_error of string
 
 (* define "never" as 01-01-2030 *)
 let start_of_epoch = Unix.gmtime 0.
@@ -27,6 +26,7 @@
 let licensed = ref None
 let expires = ref never
 let grace = ref false
+let retry = ref true
 
 let socket = "/var/xapi/v6"
 
@@ -87,7 +87,10 @@
 	else begin
 		try
 			let myassoc key args =
-				try List.assoc key args with Not_found -> raise (Unmarshalling_error key)
+				try List.assoc key args
+				with Not_found ->
+					error "key %s not found in v6d's response" key;
+					raise V6DaemonFailure
 			in
 			let get_named_string name args = XMLRPC.From.string (myassoc name args) in
 			let get_named_int name args = XMLRPC.From.int (myassoc name args) in
@@ -140,12 +143,12 @@
 		| Unix.Unix_error(a, b, c) ->
 			error "Problem while initialising (%s): %s" b (Unix.error_message a);
 			raise V6DaemonFailure
-		| V6DaemonFailure ->
+		| V6DaemonFailure | _ ->
 			warn "Did not get a proper response from the v6 licensing daemon!";
 			raise V6DaemonFailure
 	end
 	
-let get_v6_license ~__context ~host ~edition =
+let rec get_v6_license ~__context ~host ~edition =
 	try
 		let ls = Db.Host.get_license_server ~__context ~self:host in
 		let address = List.assoc "address" ls in
@@ -155,7 +158,16 @@
 		connect_and_get_license edition address port
 	with
 	| Not_found -> failwith "Missing connection details"
-	| V6DaemonFailure -> reset_state ()
+	| V6DaemonFailure ->
+		reset_state ();
+		if !retry then begin
+			error "Checkout failed. Retrying once...";
+			retry := false;
+			Thread.delay 2.;
+			get_v6_license ~__context ~host ~edition
+		end else
+			error "Checkout failed.";
+			retry := true
 	
 let release_v6_license () =
 	try
diff -r 4698cec6d4ea -r 64d19cc82053 ocaml/license/v6daemon.ml
--- a/ocaml/license/v6daemon.ml	Wed Feb 24 11:57:56 2010 +0000
+++ b/ocaml/license/v6daemon.ml	Wed Feb 24 12:02:48 2010 +0000
@@ -14,10 +14,13 @@
 
 (* v6 licensing daemon *)
 open Stringext
+open Printf
 
 module D=Debug.Debugger(struct let name="v6daemon" end)
 open D
 
+module W=Debug.Debugger(struct let name="watchdog" end)
+
 let xmlrpc_handler process req bio =
 	let path = match String.split '/' req.Http.uri with
 	| x::path::_ -> path
@@ -32,6 +35,131 @@
 	let str = Xml.to_string result in
 	debug "Response: %s" str;
 	Http_svr.response_str req s str
+	
+
+let daemon_init post_daemonize_hook process =
+	post_daemonize_hook ();
+	
+	(* unix socket *)
+	let unix_socket_path = "/var/xapi/v6" in
+	Unixext.mkdir_safe (Filename.dirname unix_socket_path) 0o700;
+	Unixext.unlink_safe unix_socket_path;
+	let domain_sock = Http_svr.bind (Unix.ADDR_UNIX(unix_socket_path)) in
+	ignore(Http_svr.start (domain_sock, "unix-RPC"));
+	Http_svr.add_handler Http.Post "/" (Http_svr.BufIO (xmlrpc_handler process));
+
+	(* TCP socket: only use for testing! *)
+(*	let localhost = Unix.inet_addr_of_string "127.0.0.1" in
+	let localhost_sock = Http_svr.bind (Unix.ADDR_INET(localhost, 4094)) in
+	Unix.setsockopt localhost_sock Unix.SO_REUSEADDR true;
+	ignore(Http_svr.start (localhost_sock, "inet-RPC"));*)
+
+	(* keep daemon alive *)
+	Threadext.keep_alive ()
+
+
+let watchdog f =
+	(* The parent process blocks SIGINT and forwards SIGTERM to the child. *)
+	ignore(Unix.sigprocmask Unix.SIG_BLOCK [Sys.sigint]);
+	Sys.catch_break false;
+	Logs.append "watchdog" Log.Info "syslog:v6d_watchdog";
+
+	(* watchdog logic *)
+	let loginfo fmt = W.info fmt in
+
+	let restart = ref true
+	and error_msg = ref "" and exit_code = ref 0
+	and last_badsig = ref (0.) and pid = ref 0
+	and last_badexit = ref (0.) and no_retry_interval = 10. in
+
+	while !restart
+	do
+		begin
+			loginfo "(Re)starting v6d...";
+			if !pid = 0 then
+				begin
+					let newpid = Unix.fork () in
+					if newpid = 0 then
+						begin
+							try
+								ignore(Unix.sigprocmask Unix.SIG_UNBLOCK [Sys.sigint]);
+								f ();
+								exit 127
+							with e ->
+								error "Caught exception at toplevel: '%s'" (Printexc.to_string e);
+								log_backtrace ();
+								raise e (* will exit the process with rc=2 *)
+						end;
+					(* parent: install a SIGTERM handler that stops restarting and forwards the signal to the new child *)
+					Sys.set_signal Sys.sigterm (Sys.Signal_handle (fun i -> restart := false; Unix.kill newpid Sys.sigterm));
+					pid := newpid
+				end;
+			try
+				(* clear the pid in all cases, except when the child was merely stopped *)
+				match snd (Unix.waitpid [] !pid) with
+				| Unix.WEXITED 0 ->
+					loginfo "Received exit code 0. Not restarting.";
+					pid := 0;
+					restart := false;
+					error_msg := "";
+				| Unix.WEXITED i ->
+					loginfo "Received exit code %d" i;
+					exit_code := i;
+					pid := 0;
+					let ctime = Unix.time () in
+					if ctime < (!last_badexit +. no_retry_interval) then
+						begin
+							restart := false;
+							loginfo "Received 2 bad exits within no-retry-interval. Giving up.";
+						end
+					else
+						begin
+							(* restart := true; -- don't need to do this - it's true already *)
+							loginfo "Received bad exit, retrying";
+							last_badexit := ctime
+						end
+				| Unix.WSIGNALED i ->
+					loginfo "Received signal %d" i;
+					pid := 0;
+					(* arbitrary choice of signals, probably need more though, for real use *)
+					if i = Sys.sigsegv || i = Sys.sigpipe then
+						begin
+							let ctime = Unix.time () in
+							if ctime < (!last_badsig +. no_retry_interval) then
+								begin
+									restart := false;
+									error_msg := sprintf "v6d died with signal %d: not restarting (2 bad signals within no_retry_interval)" i;
+									exit_code := 13
+								end
+							else
+								begin
+									loginfo "v6d died with signal %d: restarting" i;
+									last_badsig := ctime
+								end
+						end
+					else
+						begin
+							restart := false;
+							error_msg := sprintf "v6d died with signal %d: not restarting (watchdog never restarts on this signal)" i;
+							exit_code := 12
+						end
+				| Unix.WSTOPPED i ->
+					loginfo "Receive stop code %i" i;
+					Unix.sleep 1;
+					(* just resume the stopped process: the watchdog cannot do anything while the child is stopped *)
+					Unix.kill !pid Sys.sigcont;
+			with
+			| Unix.Unix_error(Unix.EINTR,_,_) -> ()
+			| e -> loginfo "Watchdog received unexpected exception: %s" (Printexc.to_string e)
+		end;
+	done;
+	if !error_msg <> "" then
+		begin
+			loginfo "v6d watchdog exiting.";
+			loginfo "Fatal: %s" !error_msg;
+			eprintf "%s\n" !error_msg;
+		end;
+	exit !exit_code		
 
 
 let daemon = ref false
@@ -51,22 +179,5 @@
 	if !pidfile <> "" then
 		Unixext.pidfile_write !pidfile;
 
-	post_daemonize_hook ();
+	watchdog (fun () -> daemon_init post_daemonize_hook process)
 	
-	(* unix socket *)
-	let unix_socket_path = "/var/xapi/v6" in
-	Unixext.mkdir_safe (Filename.dirname unix_socket_path) 0o700;
-	Unixext.unlink_safe unix_socket_path;
-	let domain_sock = Http_svr.bind (Unix.ADDR_UNIX(unix_socket_path)) in
-	ignore(Http_svr.start (domain_sock, "unix-RPC"));
-	Http_svr.add_handler Http.Post "/" (Http_svr.BufIO (xmlrpc_handler process));
-
-	(* TCP socket: only use for testing! *)
-(*	let localhost = Unix.inet_addr_of_string "127.0.0.1" in
-	let localhost_sock = Http_svr.bind (Unix.ADDR_INET(localhost, 4094)) in
-	Unix.setsockopt localhost_sock Unix.SO_REUSEADDR true;
-	ignore(Http_svr.start (localhost_sock, "inet-RPC"));*)
-
-	(* keep daemon alive *)
-	Threadext.keep_alive ()
-	
_______________________________________________
xen-api mailing list
[email protected]
http://lists.xensource.com/mailman/listinfo/xen-api

Reply via email to