From: Shane Bradley <[email protected]> Fix the header to use an absolute path for rhbz#987019. The "-o" option has been modified so that it takes a path that will be used to create the root directory where the data will be written, which will also be the location of the tarball that is created. Added an undocumented (private) "-R" option that will only capture required data. Removed the data capture of the command "lsof" because it might cause hangs when capturing lockdumps due to symlink lookups.
Signed-off-by: Shane Bradley <[email protected]> --- gfs2/scripts/gfs2_lockcapture | 181 +++++++++++++++++++++++------------------ 1 files changed, 101 insertions(+), 80 deletions(-) diff --git a/gfs2/scripts/gfs2_lockcapture b/gfs2/scripts/gfs2_lockcapture index 81a0aeb..f8ace76 100644 --- a/gfs2/scripts/gfs2_lockcapture +++ b/gfs2/scripts/gfs2_lockcapture @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python """ The script "gfs2_lockcapture" will capture locking information from GFS2 file systems and DLM. @@ -13,7 +13,7 @@ import os import os.path import logging import logging.handlers -from optparse import OptionParser, Option +from optparse import OptionParser, Option, SUPPRESS_HELP import time import platform import shutil @@ -34,7 +34,7 @@ import tarfile sure only 1 instance of this script is running at any time. @type PATH_TO_PID_FILENAME: String """ -VERSION_NUMBER = "0.9-7" +VERSION_NUMBER = "0.9-8" MAIN_LOGGER_NAME = "%s" %(os.path.basename(sys.argv[0])) PATH_TO_DEBUG_DIR="/sys/kernel/debug" PATH_TO_PID_FILENAME = "/var/run/%s.pid" %(os.path.basename(sys.argv[0])) @@ -190,12 +190,11 @@ def runCommand(command, listOfCommandOptions, standardOut=subprocess.PIPE, stand commandOptionString = "" for option in listOfCommandOptions: commandOptionString += "%s " %(option) - message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString) - if (len(stdout) > 0): - message += stdout - message += "\n" - if (len(stderr) > 0): - message += stderr + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString) + if (len(stdout.rstrip()) > 0): + message += "\n%s" %(stdout.rstrip()) + if (len(stderr.rstrip()) > 0): + message += "\n%s" %(stderr.rstrip()) logging.getLogger(MAIN_LOGGER_NAME).error(message) return False @@ -232,12 +231,11 @@ def runCommandOutput(command, listOfCommandOptions, standardOut=subprocess.PIPE, commandOptionString = "" for option in listOfCommandOptions: commandOptionString += "%s " 
%(option) - message = "An error occurred running the command: $ %s %s\n" %(command, commandOptionString) - if (len(stdout) > 0): - message += stdout - message += "\n" - if (len(stderr) > 0): - message += stderr + message = "An error occurred running the command: $ %s %s" %(command, commandOptionString) + if (len(stdout.rstrip()) > 0): + message += "\n%s" %(stdout.rstrip()) + if (len(stderr.rstrip()) > 0): + message += "\n%s" %(stderr.rstrip()) logging.getLogger(MAIN_LOGGER_NAME).error(message) return None return stdout.strip().rstrip() @@ -790,12 +788,11 @@ def getLabelMapForMountedFilesystems(clusterName, listOfMountedFilesystems): # ##################################################################### # Gather output from command functions # ##################################################################### -def gatherGeneralInformation(pathToDSTDir): +def gatherHostData(pathToDSTDir): """ This function will gather general information about the cluster and write the results to a file. The following data will be captured: hostname, date, - uname -a, uptime, contents of /proc/mounts, and ps h -AL -o tid,s,cmd. - + uname -a, uptime. @param pathToDSTDir: This is the path to directory where the files will be written to. @@ -811,19 +808,16 @@ def gatherGeneralInformation(pathToDSTDir): systemString += "UPTIME=%s" %(stdout) writeToFile(os.path.join(pathToDSTDir, "hostinformation.txt"), systemString, createFile=True) - # Copy misc files - pathToSrcFile = "/proc/mounts" - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) - pathToSrcFile = "/proc/slabinfo" - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) +def gatherDiagnosticData(pathToDSTDir): + """ + This function will gather general information about the cluster and write (or + copy) the results to a file. 
- # Copy the DLM hash table sizes: - pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size", - "/sys/kernel/config/dlm/cluster/rsbtbl_size"] - for pathToSrcFile in pathToHashTableFiles: - if (os.path.exists(pathToSrcFile)): - copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) + @param pathToDSTDir: This is the path to directory where the files will be + written to. + @type pathToDSTDir: String + """ # Get "ps -eo user,pid,%cpu,%mem,vsz,rss,tty,stat,start,time,comm,wchan" data. # Get " ps h -AL -o tid,s,cmd command = "ps" @@ -837,6 +831,28 @@ def gatherGeneralInformation(pathToDSTDir): message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput) logging.getLogger(MAIN_LOGGER_NAME).error(message) + # Copy misc files + pathToSrcFile = "/proc/mounts" + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) + pathToSrcFile = "/proc/slabinfo" + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) + + # Copy the DLM hash table sizes: + pathToHashTableFiles = ["/sys/kernel/config/dlm/cluster/lkbtbl_size", "/sys/kernel/config/dlm/cluster/dirtbl_size", + "/sys/kernel/config/dlm/cluster/rsbtbl_size"] + for pathToSrcFile in pathToHashTableFiles: + if (os.path.exists(pathToSrcFile)): + copyFile(pathToSrcFile, os.path.join(pathToDSTDir, pathToSrcFile.strip("/"))) + +def gatherOptionalDiagnosticData(pathToDSTDir): + """ + This function will gather optional information about the cluster and write + the results to a file. + + @param pathToDSTDir: This is the path to directory where the files will be + written to. + @type pathToDSTDir: String + """ # Get df -h ouput command = "df" pathToCommandOutput = os.path.join(pathToDSTDir, "df-h.cmd") @@ -848,17 +864,6 @@ def gatherGeneralInformation(pathToDSTDir): message = "There was an error writing the command output for %s to the file %s." 
%(command, pathToCommandOutput) logging.getLogger(MAIN_LOGGER_NAME).error(message) - # Get lsof ouput - command = "lsof" - pathToCommandOutput = os.path.join(pathToDSTDir, "lsof.cmd") - try: - fout = open(pathToCommandOutput, "w") - runCommand(command, [], standardOut=fout) - fout.close() - except IOError: - message = "There was an error writing the command output for %s to the file %s." %(command, pathToCommandOutput) - logging.getLogger(MAIN_LOGGER_NAME).error(message) - # Write the status of all the nodes in the cluster out. if (runCommand("which", ["cman_tool"])): command = "cman_tool" @@ -1087,7 +1092,12 @@ def __getOptions(version) : cmdParser.add_option("-P", "--disable_process_gather", action="store_true", dest="disableProcessGather", - help="the gathering of process information will be disabled", + help=SUPPRESS_HELP, + default=False) + cmdParser.add_option("-m", "--diagnostic_data", + action="store_true", + dest="enableDiagnosticData", + help=SUPPRESS_HELP, default=False) cmdParser.add_option("-o", "--path_to_output_dir", action="store", @@ -1095,7 +1105,7 @@ def __getOptions(version) : help="the directory where all the collect data will be stored", type="string", metavar="<output directory>", - default="") + default="/tmp") cmdParser.add_option("-r", "--num_of_runs", action="store", dest="numberOfRuns", @@ -1264,7 +1274,7 @@ if __name__ == "__main__": message = "Debugging has been enabled." logging.getLogger(MAIN_LOGGER_NAME).debug(message) if (cmdLineOpts.disableLoggingToConsole): - logging.disable(logging.CRITICAL) + streamHandler.setLevel(logging.CRITICAL) # ####################################################################### # Check to see if pid file exists and error if it does. 
# ####################################################################### @@ -1305,7 +1315,7 @@ if __name__ == "__main__": # ####################################################################### # Verify they want to continue because this script will trigger sysrq events. # ####################################################################### - if (not cmdLineOpts.disableQuestions): + if (not cmdLineOpts.disableQuestions and not cmdLineOpts.disableProcessGather): valid = {"yes":True, "y":True, "no":False, "n":False} question = "This script will trigger a sysrq -t event or collect the data for each pid directory located in /proc for each run. Are you sure you want to continue?" prompt = " [y/n] " @@ -1326,14 +1336,11 @@ if __name__ == "__main__": # Create the output directory to verify it can be created before # proceeding unless it is already created from a previous run data needs # to be analyzed. Probably could add more debugging on if file or dir. - # ####################################################################### - pathToOutputDir = cmdLineOpts.pathToOutputDir - if (not len(pathToOutputDir) > 0): - pathToOutputDir = "%s" %(os.path.join("/tmp", "%s-%s" %(time.strftime("%Y-%m-%d_%H%M%S"), os.path.basename(sys.argv[0])))) - # ####################################################################### + # Backup any existing directory with same name as current output # directory. # ####################################################################### + pathToOutputDir = "%s" %(os.path.join(cmdLineOpts.pathToOutputDir, "%s-%s" %(os.path.basename(sys.argv[0]), time.strftime("%Y-%m-%d")))) if (backupOutputDirectory(pathToOutputDir)): message = "This directory that will be used to capture all the data: %s" %(pathToOutputDir) logging.getLogger(MAIN_LOGGER_NAME).info(message) @@ -1388,38 +1395,13 @@ if __name__ == "__main__": logging.getLogger(MAIN_LOGGER_NAME).status(message) # Gather various bits of data from the clusternode. 
- message = "Pass (%d/%d): Gathering general information about the host." %(i, cmdLineOpts.numberOfRuns) + message = "Pass (%d/%d): Gathering simple data about the host." %(i, cmdLineOpts.numberOfRuns) logging.getLogger(MAIN_LOGGER_NAME).debug(message) - gatherGeneralInformation(pathToOutputRunDir) + gatherHostData(pathToOutputRunDir) # Write the clusternode name and id to the general information file. writeToFile(os.path.join(pathToOutputRunDir, "hostinformation.txt"), "NODE_NAME=%s\nNODE_ID=%d" %(clusternode.getClusterNodeName(), clusternode.getClusterNodeID()), appendToFile=True, createFile=True) - - # Going to sleep for 2 seconds, so that TIMESTAMP should be in the - # past in the logs so that capturing sysrq data will be guaranteed. - time.sleep(2) - - # If enabled then gather the process data. - if (not cmdLineOpts.disableProcessGather): - # Gather the backtraces for all the pids, by grabbing the /proc/<pid - # number> or triggering sysrq events to capture task bask traces - # from log. - # Gather the data in the /proc/<pid> directory if the file - # </proc/<pid>/stack exists. If file exists we will not trigger - # sysrq events. - - # Should I gather anyhow and only capture sysrq if needed. - pathToPidData = "/proc" - if (isProcPidStackEnabled(pathToPidData)): - message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData) - logging.getLogger(MAIN_LOGGER_NAME).debug(message) - gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/"))) - else: - message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." 
%(i, cmdLineOpts.numberOfRuns) - logging.getLogger(MAIN_LOGGER_NAME).debug(message) - triggerSysRQEvents() - # ####################################################################### # Gather the DLM data and lock-dumps # ####################################################################### @@ -1444,16 +1426,55 @@ if __name__ == "__main__": logging.getLogger(MAIN_LOGGER_NAME).debug(message) if(gatherGFS2LockDumps(pathToOutputRunDir, clusternode.getMountedGFS2FilesystemNames())): exitCode = 0 + # If enabled then gather the process data. This will be included even if -R option is enabled. + if (not cmdLineOpts.disableProcessGather): + # Gather the backtraces for all the pids, by grabbing the /proc/<pid + # number> or triggering sysrq events to capture task bask traces + # from log. + # Gather the data in the /proc/<pid> directory if the file + # </proc/<pid>/stack exists. If file exists we will not trigger + # sysrq events. + + # Should I gather anyhow and only capture sysrq if needed. + pathToPidData = "/proc" + if (isProcPidStackEnabled(pathToPidData)): + message = "Pass (%d/%d): Triggering the capture of all pid directories in %s." %(i, cmdLineOpts.numberOfRuns, pathToPidData) + logging.getLogger(MAIN_LOGGER_NAME).debug(message) + gatherPidData(pathToPidData, os.path.join(pathToOutputRunDir, pathToPidData.strip("/"))) + else: + message = "Pass (%d/%d): Triggering the sysrq events for the host since stack was not captured in pid directory." %(i, cmdLineOpts.numberOfRuns) + logging.getLogger(MAIN_LOGGER_NAME).debug(message) + triggerSysRQEvents() + # Gather log files message = "Pass (%d/%d): Gathering the log files for the host." %(i, cmdLineOpts.numberOfRuns) logging.getLogger(MAIN_LOGGER_NAME).debug(message) gatherLogs(os.path.join(pathToOutputRunDir, "logs")) + + # Gather diagnostic data + message = "Pass (%d/%d): Gathering diagnostic data about the host." 
%(i, cmdLineOpts.numberOfRuns) + logging.getLogger(MAIN_LOGGER_NAME).debug(message) + gatherDiagnosticData(pathToOutputRunDir) + if (cmdLineOpts.enableDiagnosticData): + # Gather diagnostic data + message = "Pass (%d/%d): Gathering optional diagnostic data about the host." %(i, cmdLineOpts.numberOfRuns) + logging.getLogger(MAIN_LOGGER_NAME).debug(message) + gatherOptionalDiagnosticData(pathToOutputRunDir) + + # ####################################################################### + # Sleep for X seconds between runs + # ####################################################################### # Sleep between each run if secondsToSleep is greater than or equal - # to 0 and current run is not the last run. - if ((cmdLineOpts.secondsToSleep >= 0) and (i < (cmdLineOpts.numberOfRuns))): - message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(cmdLineOpts.secondsToSleep) + # to 0 and current run is not the last run. Add 2 seconds to each sleep so + # that we know that there is a timestamp difference in logs between runs. + # The minimal sleep is 2 seconds. + secondsToSleep = cmdLineOpts.secondsToSleep + 2 + if (secondsToSleep < 2): + secondsToSleep = 2 + if (i < cmdLineOpts.numberOfRuns): + message = "The script will sleep for %d seconds between each run of capturing the lockdump data." %(secondsToSleep) logging.getLogger(MAIN_LOGGER_NAME).info(message) - time.sleep(cmdLineOpts.secondsToSleep) + time.sleep(secondsToSleep) # Remove the handler: logging.getLogger(MAIN_LOGGER_NAME).removeHandler(currentRunFileHandler) -- 1.7.1
