Repository: trafodion Updated Branches: refs/heads/master 06b648612 -> d83b97efe
[TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed Process de-registration is now logged to understand the spate of cores seen during stress testing Cleanup dangling semaphore incorrectly detected the problem due to pid recycling and hence de-registered the process and its query fragment from the shared segment When the RMS semaphore is held for more than 10 seconds, it is assumed to be in a deadlock situation and hence the process holding the semaphore is core dumped for further analysis. Create a global logger instance instead of a function static variable to help in debugging when problems with QRlogger is reported. Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/6a25d5c1 Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/6a25d5c1 Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/6a25d5c1 Branch: refs/heads/master Commit: 6a25d5c17ade2140759dab9a6580979147d0e26a Parents: 06b6486 Author: selvaganesang <[email protected]> Authored: Tue Oct 30 17:21:45 2018 +0000 Committer: selvaganesang <[email protected]> Committed: Tue Oct 30 17:34:58 2018 +0000 ---------------------------------------------------------------------- core/sql/bin/SqlciMain.cpp | 1 + core/sql/cli/Globals.cpp | 6 +- core/sql/executor/ex_esp_frag_dir.cpp | 6 +- core/sql/qmscommon/QRLogger.cpp | 6 +- core/sql/qmscommon/QRLogger.h | 2 + core/sql/runtimestats/SqlStats.cpp | 113 +++++++++++++++++++---------- core/sql/runtimestats/SqlStats.h | 4 +- core/sql/runtimestats/sscpipc.cpp | 4 +- 8 files changed, 99 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/bin/SqlciMain.cpp ---------------------------------------------------------------------- diff --git a/core/sql/bin/SqlciMain.cpp b/core/sql/bin/SqlciMain.cpp index fc02077..b45f108 100644 --- a/core/sql/bin/SqlciMain.cpp +++ b/core/sql/bin/SqlciMain.cpp @@ -216,6 +216,7 @@ _callable void removeProcess() { statsGlobals->removeProcess(cliGlobals->myPin()); statsGlobals->releaseStatsSemaphore(cliGlobals->getSemId(), cliGlobals->myPin()); + statsGlobals->logProcessDeath(cliGlobals->myCpu(), cliGlobals->myPin(), "Normal process death"); } else if (error == 4011) { http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/cli/Globals.cpp ---------------------------------------------------------------------- diff --git a/core/sql/cli/Globals.cpp b/core/sql/cli/Globals.cpp index 6f498fb..014a2ef 100644 --- a/core/sql/cli/Globals.cpp +++ b/core/sql/cli/Globals.cpp @@ -206,6 +206,7 @@ void CliGlobals::init( NABoolean espProcess, } else { + bool reincarnated; error = statsGlobals_->getStatsSemaphore(semId_, myPin_); statsHeap_ = (NAHeap *)statsGlobals_-> @@ -221,10 +222,12 @@ void CliGlobals::init( NABoolean espProcess, NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(), 8192, 0); - statsGlobals_->addProcess(myPin_, statsHeap_); + reincarnated = statsGlobals_->addProcess(myPin_, statsHeap_); processStats_ = statsGlobals_->getExProcessStats(myPin_); processStats_->setStartTime(myStartTime_); statsGlobals_->releaseStatsSemaphore(semId_, myPin_); + if (reincarnated) + statsGlobals_->logProcessDeath(myCpu_, myPin_, "Process reincarnated before RIP"); } } // create a default context and make it the current context @@ -298,6 +301,7 @@ CliGlobals::~CliGlobals() error = statsGlobals_->getStatsSemaphore(semId_, myPin_); statsGlobals_->removeProcess(myPin_); statsGlobals_->releaseStatsSemaphore(semId_, myPin_); + statsGlobals_->logProcessDeath(myCpu_, myPin_, "Normal process death"); sem_close((sem_t *)semId_); } } http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/executor/ex_esp_frag_dir.cpp ---------------------------------------------------------------------- diff --git a/core/sql/executor/ex_esp_frag_dir.cpp b/core/sql/executor/ex_esp_frag_dir.cpp index 6825ec0..a880ae8 100644 --- a/core/sql/executor/ex_esp_frag_dir.cpp +++ b/core/sql/executor/ex_esp_frag_dir.cpp @@ -131,6 +131,7 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals, } else { + bool reincarnated; cliGlobals_->setStatsGlobals(statsGlobals_); cliGlobals_->setSemId(semId_); error = statsGlobals_->getStatsSemaphore(semId_, pid_); @@ -142,12 +143,14 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals, // We need to set up the cliGlobals, since addProcess will call getRTSSemaphore // and it uses these members cliGlobals_->setStatsHeap(statsHeap_); - statsGlobals_->addProcess(pid_, statsHeap_); + reincarnated = statsGlobals_->addProcess(pid_, statsHeap_); ExProcessStats *processStats = statsGlobals_->getExProcessStats(pid_); processStats->setStartTime(cliGlobals_->myStartTime()); cliGlobals_->setExProcessStats(processStats); statsGlobals_->releaseStatsSemaphore(semId_, pid_); + if (reincarnated) + statsGlobals_->logProcessDeath(cpu_, pid_, "Process reincarnated before RIP"); } } cliGlobals_->setStatsHeap(statsHeap_); @@ -191,6 +194,7 @@ ExEspFragInstanceDir::~ExEspFragInstanceDir() int error = statsGlobals_->getStatsSemaphore(semId_, pid_); statsGlobals_->removeProcess(pid_); statsGlobals_->releaseStatsSemaphore(semId_, pid_); + statsGlobals_->logProcessDeath(cpu_, pid_, "Normal process death"); sem_close((sem_t *)semId_); } } http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.cpp ---------------------------------------------------------------------- diff --git a/core/sql/qmscommon/QRLogger.cpp b/core/sql/qmscommon/QRLogger.cpp index 850c7d0..bb32f4f 100644 --- a/core/sql/qmscommon/QRLogger.cpp +++ b/core/sql/qmscommon/QRLogger.cpp @@ -41,6 +41,7 @@ #include "seabed/fserr.h" BOOL gv_QRLoggerInitialized_ = FALSE; +QRLogger *gv_QRLoggerInstance_ = NULL; using namespace log4cxx; using namespace log4cxx::helpers; @@ -99,8 +100,9 @@ QRLogger::QRLogger() // ************************************************************************** QRLogger& QRLogger::instance() { - static QRLogger onlyInstance_; - return onlyInstance_; + if (gv_QRLoggerInstance_ == NULL) + gv_QRLoggerInstance_ = new QRLogger(); + return *gv_QRLoggerInstance_; } http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.h ---------------------------------------------------------------------- diff --git a/core/sql/qmscommon/QRLogger.h b/core/sql/qmscommon/QRLogger.h index 6e8244c..cd3f399 100644 --- a/core/sql/qmscommon/QRLogger.h +++ b/core/sql/qmscommon/QRLogger.h @@ -28,6 +28,8 @@ #include "NAString.h" #include "CommonLogger.h" +class QRLogger; +extern QRLogger *gv_QRLoggerInstance_; // ----- these categories are currently not used // qmscomon extern std::string CAT_SQL_COMP_QR_COMMON; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.cpp ---------------------------------------------------------------------- diff --git a/core/sql/runtimestats/SqlStats.cpp b/core/sql/runtimestats/SqlStats.cpp index 3fcf9d3..58f8eb4 100644 --- a/core/sql/runtimestats/SqlStats.cpp +++ b/core/sql/runtimestats/SqlStats.cpp @@ -194,17 +194,15 @@ const char *StatsGlobals::rmsEnvType(RTSEnvType envType) } } -void StatsGlobals::addProcess(pid_t pid, NAHeap *heap) +bool StatsGlobals::addProcess(pid_t pid, NAHeap *heap) { + bool pidReincarnated = false; if (pid >= configuredPidMax_) - return; + return pidReincarnated; char msg[256];; if (statsArray_[pid].processStats_ != NULL) { - snprintf(msg, sizeof(msg), - "Pid %d,%d got recycled soon or SSMP didn't receive the death message ", - cpu_, pid); - SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0); + pidReincarnated = true; removeProcess(pid, TRUE); } statsArray_[pid].processId_ = pid; @@ -214,7 +212,7 @@ void StatsGlobals::addProcess(pid_t pid, NAHeap *heap) incProcessStatsHeaps(); if (pid > maxPid_) maxPid_ = pid; - return; + return pidReincarnated; } void StatsGlobals::removeProcess(pid_t pid, NABoolean calledAtAdd) @@ -304,6 +302,8 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid) error = getStatsSemaphore(ssmpProcSemId_, myPid); int pidRemainingToCheck = 20; + pid_t pidsRemoved[pidRemainingToCheck]; + int numPidsRemoved = 0; pid_t firstPid = pidToCheck_; for (;maxPid_ > 0;) { @@ -326,11 +326,16 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid) Int32 ln_error = msg_mon_get_process_name( cpu_, statsArray_[pidToCheck_].processId_, processName); - if (ln_error == XZFIL_ERR_NOSUCHDEV) - removeProcess(pidToCheck_); + if (ln_error == XZFIL_ERR_NOSUCHDEV) { + pidsRemoved[numPidsRemoved++]; + removeProcess(pidToCheck_); + } } } releaseStatsSemaphore(ssmpProcSemId_, myPid); + // death messages are logged outside of semaphore due to process hop + for (int i = 0 ; i < numPidsRemoved ; i++) + logProcessDeath(cpu_, pidsRemoved[i], "Dead Process"); } // We expect a death message to be delivered to MXSSMP by the monitor @@ -340,48 +345,55 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid) // be unexpectedly holding the stats semaphore. static Int32 SemHeldTooManySeconds = -1; +static bool dumpRmsDeadLockProcess = true; void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders) { CliGlobals *cliGlobals = GetCliGlobals(); NABoolean cleanupSemaphore = FALSE; + char coreFile[512]; if (semPid_ == -1) - return; // Nobody has the semaphore, nothing to do here. + return; // Nobody has the semaphore, nothing to do here. if (NOT (cliGlobals->myPin() == getSsmpPid() && cliGlobals->myStartTime() == getSsmpTimestamp())) - return; // Only ssmp is allowed to cleanup after another process. + return; // Only ssmp is allowed to cleanup after another process. // Coverage notes - it would be too difficult to automate a test // for this since usually a death message is used to clean up a // generic SQL process' exit. But this code has been unit tested // using gdb sessions on the generic process and on this MXSSMP // process. - if (checkForSemaphoreHolders) - { - if (SemHeldTooManySeconds < 0) - { - // call getenv once per process - char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS"); - if (shtms) - { - SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms)); - if (SemHeldTooManySeconds < 1) - SemHeldTooManySeconds = 10; - } - else + Int32 lockedTimeInSecs = 0; + timespec ts; + if (SemHeldTooManySeconds < 0) + { + // call getenv once per process + char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS"); + if (shtms) + { + SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms)); + if (SemHeldTooManySeconds < 1) SemHeldTooManySeconds = 10; - } - - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - if ((ts.tv_sec - lockingTimestamp_.tv_sec) >= SemHeldTooManySeconds) - cleanupSemaphore = TRUE; - } - else - cleanupSemaphore = TRUE; - if (cleanupSemaphore) - { + } + else + SemHeldTooManySeconds = 10; + char *dumpRmsDeadLockProcessEnv = getenv("RMS_DEADLOCK_PROCESS_DUMP"); + if ((dumpRmsDeadLockProcessEnv != NULL) && + (strcmp(dumpRmsDeadLockProcessEnv, "N") == 0)) + dumpRmsDeadLockProcess = false; + } + clock_gettime(CLOCK_REALTIME, &ts); + lockedTimeInSecs = ts.tv_sec - lockingTimestamp_.tv_sec; + if (checkForSemaphoreHolders) + { + if (lockedTimeInSecs >= SemHeldTooManySeconds) + cleanupSemaphore = TRUE; + } + else + cleanupSemaphore = TRUE; + if (cleanupSemaphore) + { NAProcessHandle myPhandle; myPhandle.getmine(); myPhandle.decompose(); @@ -398,6 +410,16 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders) } else { + char msg[256]; + if (semPid_ != -1 && lockedTimeInSecs >= SemHeldTooManySeconds) { + snprintf(msg, sizeof(msg), "Pid %d, %d held semaphore for more than %d seconds ", + cpu_, semPid_, lockedTimeInSecs); + SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0); + if (dumpRmsDeadLockProcess) + msg_mon_dump_process_id(NULL, cpu_, semPid_, coreFile); + } + +/* // Is this the same incarnation of the process name? // Do not be fooled by pid recycle. MS_Mon_Process_Info_Type processInfo; @@ -411,6 +433,7 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders) seabedPidRecycle_ = true; semHoldingProcessExists = false; } +*/ } if (!semHoldingProcessExists) { @@ -1483,16 +1506,22 @@ void StatsGlobals::cleanup_SQL( removeProcess(pidToCleanup); releaseStatsSemaphore(semId, myPid); + logProcessDeath(cpu_, pidToCleanup, "Clean dangling semaphore"); } void StatsGlobals::verifyAndCleanup(pid_t pidThatDied, SB_Int64_Type seqNum) { + bool processRemoved = false; int error = getStatsSemaphore(ssmpProcSemId_, getSsmpPid()); if (statsArray_ && (statsArray_[pidThatDied].processId_ == pidThatDied) && - (statsArray_[pidThatDied].phandleSeqNum_ == seqNum)) - removeProcess(pidThatDied); + (statsArray_[pidThatDied].phandleSeqNum_ == seqNum)) { + removeProcess(pidThatDied); + processRemoved = true; + } releaseStatsSemaphore(ssmpProcSemId_, getSsmpPid()); + if (processRemoved) + logProcessDeath(cpu_, pidThatDied, "Received death message"); } void StatsGlobals::updateMemStats(pid_t pid, @@ -1631,6 +1660,16 @@ NABoolean StatsGlobals::getInitError(pid_t pid, NABoolean &reportError) return retcode; } +void StatsGlobals::logProcessDeath(short cpu, pid_t pid, const char *reason) +{ + char msg[256]; + + snprintf(msg, sizeof(msg), + "Pid %d,%d de-registered from shared segment. Reason: %s ", cpu, pid, reason); + SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0); + return; +} + short getMasterCpu(char *uniqueStmtId, Lng32 uniqueStmtIdLen, char *nodeName, short maxLen, short &cpu) { Int32 nodeNumber = 0; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.h ---------------------------------------------------------------------- diff --git a/core/sql/runtimestats/SqlStats.h b/core/sql/runtimestats/SqlStats.h index 1cfc18f..77721ec 100644 --- a/core/sql/runtimestats/SqlStats.h +++ b/core/sql/runtimestats/SqlStats.h @@ -329,7 +329,7 @@ class StatsGlobals public: StatsGlobals(void *baseAddr, short envType, Lng32 maxSegSize); static void* operator new (size_t size, void* loc = 0); - void addProcess(pid_t pid, NAHeap *heap); + bool addProcess(pid_t pid, NAHeap *heap); void removeProcess(pid_t pid, NABoolean calledDuringAdd = FALSE); ProcessStats *checkProcess(pid_t pid); void setStatsArea(pid_t pid, ExStatisticsArea *stats) @@ -346,6 +346,8 @@ public: return NULL; } + void logProcessDeath(short cpu, pid_t pid, const char *reason); + StmtStats *addQuery(pid_t pid, char *queryId, Lng32 queryIdLen, void *backRef, Lng32 fragId, char *sourceStr = NULL, Lng32 sourceStrLen = 0, NABoolean isMaster = FALSE); http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/sscpipc.cpp ---------------------------------------------------------------------- diff --git a/core/sql/runtimestats/sscpipc.cpp b/core/sql/runtimestats/sscpipc.cpp index ccb8dbb..8ec84ef 100755 --- a/core/sql/runtimestats/sscpipc.cpp +++ b/core/sql/runtimestats/sscpipc.cpp @@ -93,7 +93,9 @@ SscpGlobals::SscpGlobals(NAHeap *sscpheap, StatsGlobals *statsGlobals) NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(), 8192, 0); - statsGlobals_->addProcess(myPin_, statsHeap); + bool reincarnated = statsGlobals_->addProcess(myPin_, statsHeap); + if (reincarnated) + statsGlobals_->logProcessDeath(myCpu, myPin_, "Process reincarnated before RIP"); statsGlobals_->releaseStatsSemaphore(semId_, myPin_); CliGlobals *cliGlobals = GetCliGlobals();
