Add a 'detectBroken' function that determines whether an instance is in an unhealthy state, and what's needed to repair it. The repair is specified as an AutoRepairType constant, and a list of opcodes. The opcodes will only be executed (in following patches) if the repair type is allowed by the policy.
Signed-off-by: Dato Simó <d...@google.com> --- src/Ganeti/HTools/Program/Harep.hs | 90 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/src/Ganeti/HTools/Program/Harep.hs b/src/Ganeti/HTools/Program/Harep.hs index c3cfadf..82ae63a 100644 --- a/src/Ganeti/HTools/Program/Harep.hs +++ b/src/Ganeti/HTools/Program/Harep.hs @@ -41,6 +41,7 @@ import Ganeti.Common import Ganeti.Errors import Ganeti.Jobs import Ganeti.OpCodes +import Ganeti.OpParams import Ganeti.Types import Ganeti.Utils import qualified Ganeti.Constants as C @@ -52,6 +53,7 @@ import Ganeti.HTools.ExtLoader import Ganeti.HTools.Types import qualified Ganeti.HTools.Container as Container import qualified Ganeti.HTools.Instance as Instance +import qualified Ganeti.HTools.Node as Node -- | Options list and functions. options :: IO [OptType] @@ -265,6 +267,85 @@ commitChange client instData = do return instData { tagsToRemove = [] } +-- | Detect brokenness with an instance and suggest repair type and jobs to run. +detectBroken :: Node.List -> Instance.Instance + -> Maybe (AutoRepairType, [OpCode]) +detectBroken nl inst = + let disk = Instance.diskTemplate inst + iname = Instance.name inst + offPri = Node.offline $ Container.find (Instance.pNode inst) nl + offSec = Node.offline $ Container.find (Instance.sNode inst) nl + in + case disk of + DTDrbd8 + | offPri && offSec -> + Just ( + ArReinstall, + [ OpInstanceRecreateDisks { opInstanceName = iname + , opRecreateDisksInfo = RecreateDisksAll + , opNodes = [] + -- FIXME: there should be a better way to + -- specify opcode parameters than abusing + -- mkNonEmpty in this way (using the fact + -- that Maybe is used both for optional + -- fields, and to express failure). 
+ , opIallocator = mkNonEmpty "hail" + } + , OpInstanceReinstall { opInstanceName = iname + , opOsType = Nothing + , opTempOsParams = Nothing + , opForceVariant = False + } + ]) + | offPri -> + Just ( + ArFailover, + [ OpInstanceFailover { opInstanceName = iname + -- FIXME: ditto, see above. + , opShutdownTimeout = fromJust $ mkNonNegative + C.defaultShutdownTimeout + , opIgnoreConsistency = False + , opTargetNode = Nothing + , opIgnoreIpolicy = False + , opIallocator = Nothing + } + ]) + | offSec -> + Just ( + ArFixStorage, + [ OpInstanceReplaceDisks { opInstanceName = iname + , opReplaceDisksMode = ReplaceNewSecondary + , opReplaceDisksList = [] + , opRemoteNode = Nothing + -- FIXME: ditto, see above. + , opIallocator = mkNonEmpty "hail" + , opEarlyRelease = False + , opIgnoreIpolicy = False + } + ]) + | otherwise -> Nothing + + DTPlain + | offPri -> + Just ( + ArReinstall, + [ OpInstanceRecreateDisks { opInstanceName = iname + , opRecreateDisksInfo = RecreateDisksAll + , opNodes = [] + -- FIXME: ditto, see above. + , opIallocator = mkNonEmpty "hail" + } + , OpInstanceReinstall { opInstanceName = iname + , opOsType = Nothing + , opTempOsParams = Nothing + , opForceVariant = False + } + ]) + | otherwise -> Nothing + + _ -> Nothing -- Other cases are unimplemented for now: DTDiskless, + -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt. + -- | Main function. main :: Options -> [String] -> IO () main opts args = do @@ -276,13 +357,16 @@ main opts args = do when (isNothing lsock) $ exitErr "Luxi socket (-L) required to execute jobs." - (ClusterData _ _ il _ _) <- loadExternalData opts + (ClusterData _ nl il _ _) <- loadExternalData opts let iniData = map setInitialState $ Container.elems il master = fromJust lsock -- First step: check all pending repairs, see if they are completed. - _unused_iniData' <- bracket (L.getClient master) L.closeClient $ - forM iniData . processPending + iniData' <- bracket (L.getClient master) L.closeClient $ + forM iniData . 
processPending + + -- Second step: detect any problems. + let _unused_repairs = map (detectBroken nl . arInstance) iniData' return () -- 1.8.0.2-x20-1