Add a 'detectBroken' function that determines whether an instance is in an
unhealthy state, and what's needed to repair it. The repair is specified as
an AutoRepairType constant, and a list of opcodes. The opcodes will only be
executed (in following patches) if the repair type is allowed by the policy.

Signed-off-by: Dato Simó <d...@google.com>
---
 src/Ganeti/HTools/Program/Harep.hs | 90 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/Ganeti/HTools/Program/Harep.hs b/src/Ganeti/HTools/Program/Harep.hs
index c3cfadf..82ae63a 100644
--- a/src/Ganeti/HTools/Program/Harep.hs
+++ b/src/Ganeti/HTools/Program/Harep.hs
@@ -41,6 +41,7 @@ import Ganeti.Common
 import Ganeti.Errors
 import Ganeti.Jobs
 import Ganeti.OpCodes
+import Ganeti.OpParams
 import Ganeti.Types
 import Ganeti.Utils
 import qualified Ganeti.Constants as C
@@ -52,6 +53,7 @@ import Ganeti.HTools.ExtLoader
 import Ganeti.HTools.Types
 import qualified Ganeti.HTools.Container as Container
 import qualified Ganeti.HTools.Instance as Instance
+import qualified Ganeti.HTools.Node as Node
 
 -- | Options list and functions.
 options :: IO [OptType]
@@ -265,6 +267,85 @@ commitChange client instData = do
 
   return instData { tagsToRemove = [] }
 
+-- | Detect brokenness in an instance and suggest repair type and jobs to run.
+detectBroken :: Node.List -> Instance.Instance
+             -> Maybe (AutoRepairType, [OpCode])
+detectBroken nl inst =
+  let disk = Instance.diskTemplate inst
+      iname = Instance.name inst
+      offPri = Node.offline $ Container.find (Instance.pNode inst) nl
+      offSec = Node.offline $ Container.find (Instance.sNode inst) nl
+  in
+   case disk of
+     DTDrbd8
+       | offPri && offSec ->
+         Just (
+           ArReinstall,
+           [ OpInstanceRecreateDisks { opInstanceName = iname
+                                     , opRecreateDisksInfo = RecreateDisksAll
+                                     , opNodes = []
+                                       -- FIXME: there should be a better way to
+                                       -- specify opcode parameters than abusing
+                                       -- mkNonEmpty in this way (using the fact
+                                       -- that Maybe is used both for optional
+                                       -- fields, and to express failure).
+                                     , opIallocator = mkNonEmpty "hail"
+                                     }
+           , OpInstanceReinstall { opInstanceName = iname
+                                 , opOsType = Nothing
+                                 , opTempOsParams = Nothing
+                                 , opForceVariant = False
+                                 }
+           ])
+       | offPri ->
+         Just (
+           ArFailover,
+           [ OpInstanceFailover { opInstanceName = iname
+                                  -- FIXME: ditto, see above.
+                                , opShutdownTimeout = fromJust $ mkNonNegative
+                                                      C.defaultShutdownTimeout
+                                , opIgnoreConsistency = False
+                                , opTargetNode = Nothing
+                                , opIgnoreIpolicy = False
+                                , opIallocator = Nothing
+                                }
+           ])
+       | offSec ->
+         Just (
+           ArFixStorage,
+           [ OpInstanceReplaceDisks { opInstanceName = iname
+                                    , opReplaceDisksMode = ReplaceNewSecondary
+                                    , opReplaceDisksList = []
+                                    , opRemoteNode = Nothing
+                                      -- FIXME: ditto, see above.
+                                    , opIallocator = mkNonEmpty "hail"
+                                    , opEarlyRelease = False
+                                    , opIgnoreIpolicy = False
+                                    }
+            ])
+       | otherwise -> Nothing
+
+     DTPlain
+       | offPri ->
+         Just (
+           ArReinstall,
+           [ OpInstanceRecreateDisks { opInstanceName = iname
+                                     , opRecreateDisksInfo = RecreateDisksAll
+                                     , opNodes = []
+                                       -- FIXME: ditto, see above.
+                                     , opIallocator = mkNonEmpty "hail"
+                                     }
+           , OpInstanceReinstall { opInstanceName = iname
+                                 , opOsType = Nothing
+                                 , opTempOsParams = Nothing
+                                 , opForceVariant = False
+                                 }
+           ])
+       | otherwise -> Nothing
+
+     _ -> Nothing  -- Other cases are unimplemented for now: DTDiskless,
+                   -- DTFile, DTSharedFile, DTBlock, DTRbd, DTExt.
+
 -- | Main function.
 main :: Options -> [String] -> IO ()
 main opts args = do
@@ -276,13 +357,16 @@ main opts args = do
   when (isNothing lsock) $
     exitErr "Luxi socket (-L) required to execute jobs."
 
-  (ClusterData _ _ il _ _) <- loadExternalData opts
+  (ClusterData _ nl il _ _) <- loadExternalData opts
 
   let iniData = map setInitialState $ Container.elems il
       master = fromJust lsock
 
   -- First step: check all pending repairs, see if they are completed.
-  _unused_iniData' <- bracket (L.getClient master) L.closeClient $
-                      forM iniData . processPending
+  iniData' <- bracket (L.getClient master) L.closeClient $
+              forM iniData . processPending
+
+  -- Second step: detect any problems.
+  let _unused_repairs = map (detectBroken nl . arInstance) iniData'
 
   return ()
-- 
1.8.0.2-x20-1

Reply via email to