The data collectors should be able to provide as much information as possible even when the system is badly degraded. This patch modifies the instance status collector for Xen so that it keeps providing as much data as possible even when some of the queries it performs fail: the exitIfBad calls are removed and replaced with logging and with returning empty fields instead.
Signed-off-by: Michele Tartara <[email protected]> --- src/Ganeti/DataCollectors/InstStatus.hs | 20 ++++++++++++++------ src/Ganeti/Hypervisor/Xen.hs | 30 +++++++++++++++++++++--------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/src/Ganeti/DataCollectors/InstStatus.hs b/src/Ganeti/DataCollectors/InstStatus.hs index 65fc20c..0430274 100644 --- a/src/Ganeti/DataCollectors/InstStatus.hs +++ b/src/Ganeti/DataCollectors/InstStatus.hs @@ -43,6 +43,7 @@ import qualified Data.Map as Map import Network.BSD (getHostName) import qualified Text.JSON as J +import Ganeti.BasicTypes as BT import Ganeti.Confd.ClientFunctions import Ganeti.Common import Ganeti.DataCollectors.CLI @@ -181,12 +182,19 @@ buildInstStatusReport srvAddr srvPort = do node <- getHostName answer <- getInstances node srvAddr srvPort inst <- exitIfBad "Can't get instance info from ConfD" answer - domains <- getInferredDomInfo - uptimes <- getUptimeInfo - let primaryInst = fst inst - iStatus <- mapM (buildStatus domains uptimes) primaryInst - let globalStatus = computeGlobalStatus iStatus - jsonReport = J.showJSON $ ReportData iStatus globalStatus + d <- getInferredDomInfo + reportData <- + case d of + BT.Ok domains -> do + uptimes <- getUptimeInfo + let primaryInst = fst inst + iStatus <- mapM (buildStatus domains uptimes) primaryInst + let globalStatus = computeGlobalStatus iStatus + return $ ReportData iStatus globalStatus + BT.Bad m -> + (return . ReportData []) . DCStatus DCSCBad $ + "Unable to receive the list of instances: " ++ m + let jsonReport = J.showJSON reportData buildReport dcName dcVersion dcFormatVersion dcCategory dcKind jsonReport -- | Main function. 
diff --git a/src/Ganeti/Hypervisor/Xen.hs b/src/Ganeti/Hypervisor/Xen.hs index 06c928e..3fbffa7 100644 --- a/src/Ganeti/Hypervisor/Xen.hs +++ b/src/Ganeti/Hypervisor/Xen.hs @@ -40,21 +40,25 @@ import qualified Ganeti.BasicTypes as BT import qualified Ganeti.Constants as C import Ganeti.Hypervisor.Xen.Types import Ganeti.Hypervisor.Xen.XmParser +import Ganeti.Logging import Ganeti.Utils -- | Get information about the current Xen domains as a map where the domain -- name is the key. This only includes the information made available by Xen -- itself. -getDomainsInfo :: IO (Map.Map String Domain) +getDomainsInfo :: IO (BT.Result (Map.Map String Domain)) getDomainsInfo = do contents <- - ((E.try $ readProcess C.xenCmdXm ["list", "--long"] "") - :: IO (Either IOError String)) >>= - exitIfBad "running command" . either (BT.Bad . show) BT.Ok - case A.parseOnly xmListParser $ pack contents of - Left msg -> exitErr msg - Right dom -> return dom + (E.try $ readProcess C.xenCmdXm ["list", "--long"] "") + :: IO (Either IOError String) + return $ + either (BT.Bad . show) ( + \c -> + case A.parseOnly xmListParser $ pack c of + Left msg -> BT.Bad msg + Right dom -> BT.Ok dom + ) contents -- | Given a domain and a map containing information about multiple domains, -- infer additional information about that domain (specifically, whether it is @@ -70,11 +74,19 @@ inferDomInfos domMap dom1 = -- name is the key. This includes information made available by Xen itself as -- well as further information that can be inferred by querying Xen multiple -- times and comparing the results. -getInferredDomInfo :: IO (Map.Map String Domain) +getInferredDomInfo :: IO (BT.Result (Map.Map String Domain)) getInferredDomInfo = do domMap1 <- getDomainsInfo domMap2 <- getDomainsInfo - return $ fmap (inferDomInfos domMap2) domMap1 + case (domMap1, domMap2) of + (BT.Bad m1, BT.Bad m2) -> return . 
BT.Bad $ m1 ++ "\n" ++ m2 + (BT.Bad m, BT.Ok d) -> do + logWarning $ "Unable to retrieve domains info the first time" ++ m + return $ BT.Ok d + (BT.Ok d, BT.Bad m) -> do + logWarning $ "Unable to retrieve domains info the second time" ++ m + return $ BT.Ok d + (BT.Ok d1, BT.Ok d2) -> return . BT.Ok $ fmap (inferDomInfos d2) d1 -- | Get information about the uptime of domains, as a map where the domain ID -- is the key. -- 1.7.10.4
