>From 59d96d7b8cacf6531515f91703cb00c92bcfe51e Mon Sep 17 00:00:00 2001
From: pradeep <[email protected]>
Date: Thu, 7 Jul 2011 18:07:35 +0530
Subject: [PATCH] [Autotest] [PATCH ] Virt: Adding softlockup subtest
Signed-off-by: pradeep <[email protected]>
new file: client/tests/kvm/deps/heartbeat_slu.py
modified: client/tests/kvm/tests_base.cfg.sample
new file: client/virt/tests/softlockup.py
---
client/tests/kvm/deps/heartbeat_slu.py | 205 ++++++++++++++++++++++++++++++++
client/tests/kvm/tests_base.cfg.sample | 14 ++
client/virt/tests/softlockup.py | 88 ++++++++++++++
3 files changed, 307 insertions(+), 0 deletions(-)
create mode 100644 client/tests/kvm/deps/heartbeat_slu.py
create mode 100644 client/virt/tests/softlockup.py
diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py
new file mode 100644
index 0000000..697bbbf
--- /dev/null
+++ b/client/tests/kvm/deps/heartbeat_slu.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+"""
+Heartbeat server/client to detect soft lockups
+"""
+
+import socket, os, sys, time, getopt
+
+def daemonize(output_file):
+    """
+    Detach the process from its parent and redirect the standard streams.
+
+    Forks once: the parent exits immediately, the child starts a new session
+    and carries on. stdin is reopened on /dev/null; stdout and stderr are
+    pointed at output_file, or at /dev/null when no file name is given.
+
+    @param output_file: Path of the file that receives stdout and stderr.
+    @raise Exception: If fork() fails.
+    """
+    try:
+        pid = os.fork()
+    except OSError, e:
+        # errno is the integer for %d, strerror the message for %s
+        raise Exception, "error %d: %s" % (e.errno, e.strerror)
+
+    if pid:
+        # parent process: leave right away, only the child keeps running
+        os._exit(0)
+
+    os.umask(0)
+    os.setsid()
+    sys.stdout.flush()
+    sys.stderr.flush()
+
+    # test the argument itself; the "file" builtin is always true and would
+    # make the /dev/null fallback unreachable
+    if output_file:
+        output_handle = file(output_file, 'a+', 0)
+        # autoflush stdout/stderr
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+        sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)
+    else:
+        output_handle = file('/dev/null', 'a+')
+
+    stdin_handle = open('/dev/null', 'r')
+    os.dup2(output_handle.fileno(), sys.stdout.fileno())
+    os.dup2(output_handle.fileno(), sys.stderr.fileno())
+    os.dup2(stdin_handle.fileno(), sys.stdin.fileno())
+
+def recv_all(sock):
+    """
+    Read from sock until the peer closes the connection.
+
+    @param sock: Connected socket to read from.
+    @return: Everything that was received, joined into one string.
+    """
+    chunks = []
+    chunk = sock.recv(1024)
+    # recv() hands back an empty string once the other side is done
+    while chunk:
+        chunks.append(chunk)
+        chunk = sock.recv(1024)
+    return ''.join(chunks)
+
+def run_server(host, port, daemon, file, queue_size, threshold, drift):
+    """
+    Accept heartbeats forever and report soft lockups.
+
+    Every accepted connection carries one heartbeat, which is handed to
+    check_heartbeat(). Clients that stopped sending are looked for at most
+    once every two thresholds. Reads the module-level verbose flag.
+
+    @param host: Address to bind the listening socket to.
+    @param port: TCP port to listen on.
+    @param daemon: If true, daemonize before opening the socket.
+    @param file: Output file used for stdout/stderr when daemonized.
+    @param queue_size: Backlog passed to listen().
+    @param threshold: Seconds allowed between two heartbeats of one host.
+    @param drift: If true, track and report the clock drift of each client.
+    """
+    if daemon:
+        daemonize(output_file=file)
+    # keep the flag under a name of its own, so the per-heartbeat report
+    # below can not overwrite it
+    check_drift = drift
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind((host, port))
+    sock.listen(queue_size)
+    timeout_interval = threshold * 2.0
+    prev_check_timestamp = float(time.time())
+    while True:
+        c_sock, c_addr = sock.accept()
+        try:
+            heartbeat = recv_all(c_sock)
+        finally:
+            # one connection per heartbeat: release it right away
+            c_sock.close()
+        local_timestamp = float(time.time())
+        try:
+            drift_report = check_heartbeat(heartbeat, local_timestamp,
+                                           threshold, check_drift)
+        except ValueError:
+            # empty or malformed payload: report it and keep serving
+            # instead of letting one bad connection stop the monitor
+            print "%.2f: ERROR, malformed heartbeat from %s: %r" \
+                % (local_timestamp, c_addr[0], heartbeat)
+            continue
+        # NOTE: this doesn't work if the only client is the one that timed
+        # out, but anything more complete would require another thread and
+        # a lock for client_prev_timestamp.
+        if local_timestamp - prev_check_timestamp > timeout_interval:
+            check_for_timeouts(threshold, check_drift)
+            prev_check_timestamp = local_timestamp
+        if verbose:
+            if check_drift:
+                print "%.2f: %s (%s)" % (local_timestamp, heartbeat,
+                                         drift_report)
+            else:
+                print "%.2f: %s" % (local_timestamp, heartbeat)
+
+def run_client(host, port, daemon, file, interval):
+    """
+    Send one heartbeat per interval to the server, forever.
+
+    A new connection is opened for every heartbeat. Socket errors are
+    printed and the loop carries on, so a server that is unreachable for a
+    while does not stop the client. Reads the module-level verbose flag.
+
+    @param host: Address of the heartbeat server.
+    @param port: TCP port of the heartbeat server.
+    @param daemon: If true, daemonize before sending the first heartbeat.
+    @param file: Output file used for stdout/stderr when daemonized.
+    @param interval: Seconds to sleep between two heartbeats.
+    """
+    if daemon:
+        daemonize(output_file=file)
+    seq = 1
+    while True:
+        sock = None
+        try:
+            # nested try blocks: also valid where try/except/finally can
+            # not be combined in a single statement
+            try:
+                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                sock.connect((host, port))
+                heartbeat = get_heartbeat(seq)
+                sock.sendall(heartbeat)
+            finally:
+                # release the descriptor even when connect/sendall failed
+                if sock is not None:
+                    sock.close()
+            if verbose:
+                print heartbeat
+        except socket.error, e:
+            # not every socket.error carries an (errno, message) pair, so
+            # unpacking it in the except clause could itself raise
+            if len(e.args) == 2:
+                detail = "%s - %s" % e.args
+            else:
+                detail = str(e)
+            print "%.2f: ERROR, %s" % (float(time.time()), detail)
+
+        seq += 1
+        time.sleep(interval)
+
+def get_heartbeat(seq=1):
+    """
+    Build the heartbeat message for this host.
+
+    @param seq: Sequence number of the heartbeat.
+    @return: String made of the local hostname, the zero padded sequence
+            number and the current time, separated by single spaces.
+    """
+    now = float(time.time())
+    return "%s %06d %.2f" % (hostname, seq, now)
+
+def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift):
+    """
+    Process one heartbeat: alert on a late arrival and compute the drift.
+
+    Updates the module-level client_prev_timestamp table and, when drift
+    checking is on, client_clock_offset and client_prev_drift as well.
+
+    @param heartbeat: Message of the form "<hostname> <seq> <timestamp>".
+    @param local_timestamp: Local time at which the heartbeat was received.
+    @param threshold: Seconds allowed between two heartbeats of one host.
+    @param check_drift: If true, compute the clock drift of the sender.
+    @return: Drift report string, or None when check_drift is false.
+    @raise ValueError: If the heartbeat does not have exactly three fields
+            or its timestamp is not a number.
+    """
+    sender, _seq, sent_at = heartbeat.rsplit()
+    sent_at = float(sent_at)
+
+    if sender in client_prev_timestamp:
+        gap = local_timestamp - client_prev_timestamp[sender]
+        if gap > threshold:
+            print "%.2f: ALERT, SLU detected on host %s, delta %ds" \
+                % (float(time.time()), sender, gap)
+    client_prev_timestamp[sender] = local_timestamp
+
+    if not check_drift:
+        return None
+
+    if sender not in client_clock_offset:
+        # first heartbeat of this host: remember the clock offset as the
+        # baseline that later drift values are measured against
+        client_clock_offset[sender] = sent_at - local_timestamp
+        client_prev_drift[sender] = 0
+    drift = sent_at - local_timestamp - client_clock_offset[sender]
+    drift_delta = drift - client_prev_drift[sender]
+    client_prev_drift[sender] = drift
+    return "drift %+4.2f (%+4.2f)" % (drift, drift_delta)
+
+def check_for_timeouts(threshold, check_drift):
+    """
+    Report and forget every client that has been silent for too long.
+
+    A client counts as timed out when its last heartbeat is more than twice
+    the threshold old. Its entries are removed afterwards, so it is not
+    reported again until it sends a new heartbeat.
+
+    @param threshold: Seconds allowed between two heartbeats of one host.
+    @param check_drift: If true, also drop the drift data of the client.
+    """
+    local_timestamp = float(time.time())
+    # walk over a copy of the keys, entries are deleted inside the loop
+    hostname_list = list(client_prev_timestamp)
+    for hostname in hostname_list:
+        timestamp = client_prev_timestamp[hostname]
+        delta = local_timestamp - timestamp
+        if delta > threshold * 2:
+            print ("%.2f: ALERT, SLU detected on host %s, "
+                   "no heartbeat for %ds"
+                   % (local_timestamp, hostname, delta))
+            del client_prev_timestamp[hostname]
+            if check_drift:
+                del client_clock_offset[hostname]
+                del client_prev_drift[hostname]
+
+def usage():
+    """Print the command line synopsis for server and client mode."""
+    print """
+Usage:
+
+    heartbeat_slu.py --server --address <bind_address> --port <bind_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--threshold <heartbeat threshold>] [--check-drift]
+
+    heartbeat_slu.py --client --address <server_address> --port <server_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--interval <heartbeat interval in seconds>]
+"""
+
+# host information and global data
+hostname = socket.gethostname()
+# server side state, keyed by the hostname found in each heartbeat
+client_prev_timestamp = {}  # local time the latest heartbeat arrived
+client_clock_offset = {}    # client minus local clock at first heartbeat
+client_prev_drift = {}      # drift computed for the previous heartbeat
+
+# default param values
+host_port = 9001
+host_address = ''
+interval = 1 # seconds between heartbeats
+threshold = 10 # seconds late till alert
+is_server = False
+is_daemon = True
+file_server = "/tmp/heartbeat_server.out"
+file_client = "/tmp/heartbeat_client.out"
+file_selected = None
+queue_size = 5
+verbose = False
+check_drift = False
+
+# process cmdline opts
+try:
+    # -v -h -s -c -d are plain flags; -f -p -a -i -t expect a value
+    opts, args = getopt.getopt(sys.argv[1:], "vhscdf:p:a:i:t:", [
+        "server", "client", "no-daemon", "address=", "port=",
+        "file=", "interval=", "threshold=", "verbose",
+        "check-drift", "help"])
+except getopt.GetoptError, e:
+    print "error: %s" % str(e)
+    usage()
+    sys.exit(1)
+
+for param, value in opts:
+    if param in ["-p", "--port"]:
+        host_port = int(value)
+    elif param in ["-a", "--address"]:
+        host_address = value
+    elif param in ["-s", "--server"]:
+        is_server = True
+    elif param in ["-c", "--client"]:
+        is_server = False
+    elif param in ["--no-daemon"]:
+        is_daemon = False
+    elif param in ["-f", "--file"]:
+        file_selected = value
+    elif param in ["-i", "--interval"]:
+        interval = int(value)
+    elif param in ["-t", "--threshold"]:
+        threshold = int(value)
+    elif param in ["-d", "--check-drift"]:
+        check_drift = True
+    elif param in ["-v", "--verbose"]:
+        verbose = True
+    elif param in ["-h", "--help"]:
+        usage()
+        sys.exit(0)
+    else:
+        # name the option itself; its value is empty for plain flags
+        print "error: unrecognized option: %s" % param
+        usage()
+        sys.exit(1)
+
+# run until we're terminated
+if is_server:
+    file_server = file_selected or file_server
+    run_server(host_address, host_port, is_daemon, file_server, queue_size,
+               threshold, check_drift)
+else:
+    file_client = file_selected or file_client
+    run_client(host_address, host_port, is_daemon, file_client, interval)
diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample
index 65880d8..d07eef8 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -420,6 +420,20 @@ variants:
type = smbios_table
start_vm = no
+ - softlockup: install setup unattended_install.cdrom
+ only Linux
+ type = softlockup
+ softlockup_files = stress-1.0.4.tar.gz
+ stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && cd stress-1.0.4 && ./configure && make && make install"
+ server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --verbose --check-drift"
+ client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --interval 1"
+ stress_cmd = "nohup stress -c %s & "
+ drift_cmd = "tail -1 %s | awk '{print $7}'"
+ monitor_log_file = /tmp/heartbeat_server.log
+ stress_threshold = 10
+ # test_duration is given in hours, e.g. 12, 18, 24 or 48
+ test_duration = 12
+
- stress_boot: install setup image_copy unattended_install.cdrom
type = stress_boot
max_vms = 5
diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py
new file mode 100644
index 0000000..51e961e
--- /dev/null
+++ b/client/virt/tests/softlockup.py
@@ -0,0 +1,88 @@
+import logging, os, socket, time
+from autotest_lib.client.bin import utils
+
+def run_softlockup(test, params, env):
+    """
+    Soft lockup / clock drift test under stress.
+
+    1) Boot up a VM
+    2) Build stress on host and guest
+    3) Run heartbeat with the given options: server on the host, client on
+       the guest
+    4) Run for a long duration, e.g. 12, 18 or 24 hours
+    5) Output the test result and observe drift
+
+    @param test: KVM test object.
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+    setup_cmd = params.get("stress_setup_cmd")
+    stress_cmd = params.get("stress_cmd")
+    server_setup_cmd = params.get("server_setup_cmd")
+    drift_cmd = params.get("drift_cmd")
+    threshold = int(params.get("stress_threshold"))
+    monitor_log_file = params.get("monitor_log_file")
+    # test_duration is configured in hours, the code below needs seconds
+    test_duration = 3600 * int(params.get("test_duration"))
+
+    vm = env.get_vm(params["main_vm"])
+    login_timeout = int(params.get("login_timeout", 360))
+    stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress")
+    monitor_dir = os.path.join(test.bindir, 'deps')
+
+    def client():
+        """
+        Set up the guest, start heartbeat and stress on it, wait for the
+        test duration and then clean up and collect the drift.
+        """
+        vm.verify_alive()
+        session = vm.wait_for_login(timeout=login_timeout)
+        session1 = vm.wait_for_login(timeout=login_timeout)
+        #Get required files and copy from host to guest
+        monitor_path = os.path.join(monitor_dir, 'heartbeat_slu.py')
+        stress_path = os.path.join(stress_dir, "stress-1.0.4.tar.gz")
+        vm.copy_files_to(monitor_path, "/tmp")
+        vm.copy_files_to(stress_path, "/tmp")
+
+        host_ip = socket.gethostbyname(socket.gethostname())
+        logging.info("Setup client, run stress and heartbeat on guest")
+        #Setup guest
+        session.cmd(setup_cmd % "/tmp", timeout=200)
+        #Start heartbeat on guest
+        session.cmd(params.get("client_setup_cmd") % ("/tmp", host_ip))
+        #Where <num_threads> should be twice the number of vcpus allocated
+        #to the guest.
+        num_threads = 2 * int(params.get("smp", 1))
+        #Run stress test
+        session1.cmd(stress_cmd % num_threads, timeout=test_duration)
+        session.close()
+        #sleep for test duration
+        time.sleep(test_duration)
+        #the stress session is not needed any more once the run is over
+        session1.close()
+        wait_for_timeout()
+
+
+    def server():
+        """
+        Build stress on the host, then start the heartbeat server and the
+        stress load there.
+        """
+        #Get number of threads to run stress. where <num_threads> should be
+        #twice the number of hardware/hyper threads
+        threads = 2 * utils.count_cpus()
+        logging.info("Setup server, run stress and heartbeat on host")
+
+        #Setup server
+        utils.run(setup_cmd % stress_dir)
+
+        #Run heartbeat script
+        utils.run(server_setup_cmd % (monitor_dir, threshold,
+                                      monitor_log_file))
+        #Run stress test, as it generates several types of stress
+        #(CPU,IO, network)
+        utils.run(stress_cmd % threads, timeout=test_duration)
+
+
+    def wait_for_timeout():
+        """
+        Kill stress and heartbeat on guest and host, then log the drift
+        taken from the heartbeat server log.
+        """
+        session2 = vm.wait_for_login(timeout=login_timeout)
+        #kill stress, heartbeat on host and guest
+        session2.cmd("pkill -f stress")
+        session2.cmd("ps aux | grep heart| grep -v grep| awk '{print$2}'| "
+                     "xargs kill -9")
+        session2.close()
+        utils.run("pkill -f stress")
+        utils.run("ps aux | grep heart | grep -v grep | awk '{print$2}' | "
+                  "xargs kill -9")
+        #Collect drift
+        drift = utils.system_output(drift_cmd % monitor_log_file)
+        logging.info("Drift noticed %s", drift)
+
+    server()
+    client()
--
1.7.0.4
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest