On Thu, Jun 23, 2011 at 11:48 PM, Pradeep Kumar
<[email protected]> wrote:
> I sent a different patch earlier; please ignore it.

Hi Pradeep, awesome. Thanks for the patches! I am reading them.

However, I've noticed you are generating your patches with raw diff,
which is a real pain to do manually.

So, may I suggest that you start using git to write your patchsets to
autotest? It's really easy and pleasant to do so. Please refer to:

http://autotest.kernel.org/wiki/DownloadSource
http://autotest.kernel.org/wiki/GitWorkflow

It's OK to send the tarballs separately, as in git they'd generate
huge binary diffs, but for source code itself, please consider using
git.

Cheers,

Lucas

> Soft Lockup/Clock Drift Test Setup
>
> The basic setup used for testing for soft lockup and clock drift issues 
> amongst guests
> in our environment was to generate guest and host cpu load to induce high 
> scheduling
> latencies for guest vcpus. Guests were then monitored from a separate machine 
> using a
> "heartbeat" monitor that records periodic heartbeat messages from the guests 
> and compares
>  guest timestamp information to the local timestamp to calculate clock drift.
>
>
> Heartbeats that are late by a certain number of seconds, in our case 
> configured to be 10 seconds,
> as well as heartbeats which have not been delivered for some period of time, 
> cause soft lockup
> alerts to be generated in the heartbeat monitor's log file. It is important 
> to note that other
> factors, such as network issues, can account for these delayed/missed 
> heartbeats.
> Actual data on the number of soft lockups should be collected from within each guest.
>
> Thanks to Mike Roath for providing the necessary help.
>
> Signed-off-by: Pradeep K Surisetty <[email protected]>
> ---
> diff -uprN autotest/client/tests/stress/heartbeat_slu.py 
> autotest-new/client/tests/stress/heartbeat_slu.py
> --- autotest/client/tests/stress/heartbeat_slu.py       1970-01-01 
> 05:30:00.000000000 +0530
> +++ autotest-new/client/tests/stress/heartbeat_slu.py   2011-06-24 
> 07:54:25.667574558 +0530
> @@ -0,0 +1,205 @@
> +#!/usr/bin/env python
> +
> +"""
> +Heartbeat server/client to detect soft lockups
> +"""
> +
> +import socket, os, sys, time, getopt
> +
> +def daemonize(output_file):
> +    try:
> +        pid = os.fork()
> +    except OSError, e:
> +        raise Exception, "error %d: %s" % (e.strerror, e.errno)
> +
> +    if pid:
> +        os._exit(0)
> +
> +    os.umask(0)
> +    os.setsid()
> +    sys.stdout.flush()
> +    sys.stderr.flush()
> +
> +    if file:
> +        output_handle = file(output_file, 'a+', 0)
> +        # autoflush stdout/stderr
> +        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
> +        sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)
> +    else:
> +        output_handle = file('/dev/null', 'a+')
> +
> +    stdin_handle = open('/dev/null', 'r')
> +    os.dup2(output_handle.fileno(), sys.stdout.fileno())
> +    os.dup2(output_handle.fileno(), sys.stderr.fileno())
> +    os.dup2(stdin_handle.fileno(), sys.stdin.fileno())
> +
> +def recv_all(sock):
> +    total_data = []
> +    while True:
> +        data = sock.recv(1024)
> +        if not data:
> +            break
> +        total_data.append(data)
> +    return ''.join(total_data)
> +
> +def run_server(host, port, daemon, file, queue_size, threshold, drift):
> +    if daemon:
> +        daemonize(output_file=file)
> +    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> +    sock.bind((host, port))
> +    sock.listen(queue_size)
> +    timeout_interval = threshold * 2
> +    prev_check_timestamp = float(time.time())
> +    while 1:
> +        c_sock, c_addr = sock.accept()
> +        heartbeat = recv_all(c_sock)
> +        local_timestamp = float(time.time())
> +        drift = check_heartbeat(heartbeat, local_timestamp, threshold, 
> check_drift)
> +        # NOTE: this doesn't work if the only client is the one that timed
> +        # out, but anything more complete would require another thread and
> +        # a lock for client_prev_timestamp.
> +        if local_timestamp - prev_check_timestamp > threshold * 2.0:
> +            check_for_timeouts(threshold, check_drift)
> +            prev_check_timestamp = local_timestamp
> +        if verbose:
> +            if check_drift:
> +                print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift)
> +            else:
> +                print "%.2f: %s" % (local_timestamp, heartbeat)
> +
> +def run_client(host, port, daemon, file, interval):
> +    if daemon:
> +        daemonize(output_file=file)
> +    seq = 1
> +    while 1:
> +        try:
> +            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
> +            sock.connect((host, port))
> +            heartbeat = get_heartbeat(seq)
> +            sock.sendall(heartbeat)
> +            sock.close()
> +            if verbose:
> +                print heartbeat
> +        except socket.error, (value, message):
> +            print "%.2f: ERROR, %d - %s" % (float(time.time()), value, 
> message)
> +
> +        seq += 1
> +        time.sleep(interval)
> +
> +def get_heartbeat(seq=1):
> +    return "%s %06d %.2f" % (hostname, seq, float(time.time()))
> +
> +def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift):
> +    hostname, seq, timestamp = heartbeat.rsplit()
> +    timestamp = float(timestamp)
> +    if client_prev_timestamp.has_key(hostname):
> +        delta = local_timestamp - client_prev_timestamp[hostname]
> +        if delta > threshold:
> +            print "%.2f: ALERT, SLU detected on host %s, delta %ds" \
> +                % (float(time.time()), hostname, delta)
> +
> +    client_prev_timestamp[hostname] = local_timestamp
> +
> +    if check_drift:
> +        if not client_clock_offset.has_key(hostname):
> +            client_clock_offset[hostname] = timestamp - local_timestamp
> +            client_prev_drift[hostname] = 0
> +        drift = timestamp - local_timestamp - client_clock_offset[hostname]
> +        drift_delta = drift - client_prev_drift[hostname]
> +        client_prev_drift[hostname] = drift
> +        return "drift %+4.2f (%+4.2f)" % (drift, drift_delta)
> +
> +def check_for_timeouts(threshold, check_drift):
> +    local_timestamp = float(time.time())
> +    hostname_list = list(client_prev_timestamp)
> +    for hostname in hostname_list:
> +        timestamp = client_prev_timestamp[hostname]
> +        delta = local_timestamp - timestamp
> +        if delta > threshold * 2:
> +            print "%.2f: ALERT, SLU detected on host %s, no heartbeat for 
> %ds" \
> +                % (local_timestamp, hostname, delta)
> +            del client_prev_timestamp[hostname]
> +            if check_drift:
> +                del client_clock_offset[hostname]
> +                del client_prev_drift[hostname]
> +
> +def usage():
> +    print """
> +Usage:
> +
> +    heartbeat_slu.py --server --address <bind_address> --port <bind_port>
> +                     [--file <output_file>] [--no-daemon] [--verbose]
> +                     [--threshold <heartbeat threshold>]
> +
> +    heartbeat_slu.py --client --address <server_address> -p <server_port>
> +                     [--file output_file] [--no-daemon] [--verbose]
> +                     [--interval <heartbeat interval in seconds>]
> +"""
> +
> +# host information and global data
> +hostname = socket.gethostname()
> +client_prev_timestamp = {}
> +client_clock_offset = {}
> +client_prev_drift = {}
> +
> +# default param values
> +host_port = 9001
> +host_address = ''
> +interval = 1 # seconds between heartbeats
> +threshold = 10 # seconds late till alert
> +is_server = False
> +is_daemon = True
> +file_server = "/tmp/heartbeat_server.out"
> +file_client = "/tmp/heartbeat_client.out"
> +file_selected = None
> +queue_size = 5
> +verbose = False
> +check_drift = False
> +
> +# process cmdline opts
> +try:
> +    opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [
> +                    "server", "client", "no-daemon", "address=", "port=",
> +                    "file=", "server", "interval=", "threshold=", "verbose",
> +                    "check-drift", "help"])
> +except getopt.GetoptError, e:
> +    print "error: %s" % str(e)
> +    usage()
> +    exit(1)
> +
> +for param, value in opts:
> +    if param in ["-p", "--port"]:
> +        host_port = int(value)
> +    elif param in ["-a", "--address"]:
> +        host_address = value
> +    elif param in ["-s", "--server"]:
> +        is_server = True
> +    elif param in ["-c", "--client"]:
> +        is_server = False
> +    elif param in ["--no-daemon"]:
> +        is_daemon = False
> +    elif param in ["-f", "--file"]:
> +        file_selected = value
> +    elif param in ["-i", "--interval"]:
> +        interval = int(value)
> +    elif param in ["-t", "--threshold"]:
> +        threshold = int(value)
> +    elif param in ["-d", "--check-drift"]:
> +        check_drift = True
> +    elif param in ["-v", "--verbose"]:
> +        verbose = True
> +    elif param in ["-h", "--help"]:
> +        usage()
> +        exit(0)
> +    else:
> +        print "error: unrecognized option: %s" % value
> +        usage()
> +        exit(1)
> +
> +# run until we're terminated
> +if is_server:
> +    file_server = file_selected or file_server
> +    run_server(host_address, host_port, is_daemon, file_server, queue_size, 
> threshold, check_drift)
> +else:
> +    file_client = file_selected or file_client
> +    run_client(host_address, host_port, is_daemon, file_client, interval)
> Binary files autotest/client/tests/stress/stress-1.0.0.tar.gz and 
> autotest-new/client/tests/stress/stress-1.0.0.tar.gz differ
> Binary files autotest/client/tests/stress/stress-1.0.4.tar.gz and 
> autotest-new/client/tests/stress/stress-1.0.4.tar.gz differ
> diff -uprN autotest/client/tests/stress/stress.py 
> autotest-new/client/tests/stress/stress.py
> --- autotest/client/tests/stress/stress.py      2011-06-24 07:49:23.853373401 
> +0530
> +++ autotest-new/client/tests/stress/stress.py  2011-06-24 07:53:49.848420945 
> +0530
> @@ -17,8 +17,8 @@ class stress(test.test):
>         self.job.require_gcc()
>
>
> -    # http://weather.ou.edu/~apw/projects/stress/stress-1.0.0.tar.gz
> -    def setup(self, tarball = 'stress-1.0.0.tar.gz'):
> +    # http://weather.ou.edu/~apw/projects/stress/stress-1.0.4.tar.gz
> +    def setup(self, tarball = 'stress-1.0.4.tar.gz'):
>         tarball = utils.unmap_url(self.bindir, tarball, self.tmpdir)
>         utils.extract_tarball_to_dir(tarball, self.srcdir)
>         os.chdir(self.srcdir)
> ---
> _______________________________________________
> Autotest mailing list
> [email protected]
> http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
>



-- 
Lucas
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to