At the beginning of each maintenance round, collect the
data for the self-diagnose collector from each node. If
the state is not Ok, and the incident is not known already,
register it. Note that we poll from each online node even
if an incident is known already, as the state of the node
might have gotten worse.

Signed-off-by: Klaus Aehlig <[email protected]>
---
 Makefile.am                           |   1 +
 src/Ganeti/MaintD/CollectIncidents.hs | 129 ++++++++++++++++++++++++++++++++++
 src/Ganeti/MaintD/Server.hs           |   2 +
 3 files changed, 132 insertions(+)
 create mode 100644 src/Ganeti/MaintD/CollectIncidents.hs

diff --git a/Makefile.am b/Makefile.am
index 87b2cc2..c7b7f15 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -980,6 +980,7 @@ HS_LIB_SRCS = \
        src/Ganeti/Luxi.hs \
        src/Ganeti/MaintD/Autorepairs.hs \
        src/Ganeti/MaintD/Balance.hs \
+       src/Ganeti/MaintD/CollectIncidents.hs \
         src/Ganeti/MaintD/MemoryState.hs \
        src/Ganeti/MaintD/Server.hs \
        src/Ganeti/MaintD/Utils.hs \
diff --git a/src/Ganeti/MaintD/CollectIncidents.hs b/src/Ganeti/MaintD/CollectIncidents.hs
new file mode 100644
index 0000000..706ee71
--- /dev/null
+++ b/src/Ganeti/MaintD/CollectIncidents.hs
@@ -0,0 +1,129 @@
+{-| Discovery of incidents by the maintenance daemon.
+
+This module implements the querying of all monitoring
+daemons for the value of the node-status data collector.
+Any new incident gets registered.
+
+-}
+
+{-
+
+Copyright (C) 2015 Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-}
+
+module Ganeti.MaintD.CollectIncidents
+  ( collectIncidents
+  ) where
+
+import Control.Applicative (liftA2)
+import Control.Monad (unless)
+import Control.Monad.IO.Class (liftIO)
+import Data.IORef (IORef)
+import Network.Curl
+import System.Time (getClockTime)
+import qualified Text.JSON as J
+
+import Ganeti.BasicTypes (ResultT)
+import qualified Ganeti.Constants as C
+import qualified Ganeti.DataCollectors.Diagnose as D
+import Ganeti.DataCollectors.Types (getCategoryName)
+import qualified Ganeti.HTools.Container as Container
+import qualified Ganeti.HTools.Node as Node
+import Ganeti.Logging.Lifted
+import Ganeti.MaintD.MemoryState (MemoryState, getIncidents, updateIncident)
+import Ganeti.Objects.Maintenance
+import Ganeti.Utils (newUUID)
+
+-- | Query the MonD of a node, unless the node is offline, and return the
+-- decoded JSON report, if available. Offline nodes, unreachable nodes
+-- and reports that are not valid JSON all yield 'Nothing'.
+queryStatus :: Node.Node -> IO (Maybe J.JSValue)
+queryStatus node = do
+  let name = Node.name node
+  let url = name ++ ":" ++ show C.defaultMondPort
+            ++ "/1/report/" ++ maybe "default" getCategoryName D.dcCategory
+            ++ "/" ++ D.dcName
+  if Node.offline node
+    then do
+      logDebug $ "Not asking " ++ name ++ "; it is offline"
+      return Nothing
+    else do
+      (code, body) <- liftIO $ curlGetString url []
+      case code of
+        CurlOK ->
+          case J.decode body of
+            J.Ok r -> return $ Just r
+            _ -> logWarning ("Bad report from " ++ name) >> return Nothing
+        _ -> do
+          logWarning $ "Failed to contact " ++ name
+          return Nothing
+
+-- | Inspect one node; register an incident for it unless already known.
+updateNode :: IORef MemoryState -> Node.Node -> ResultT String IO ()
+updateNode memstate node = do
+  let name = Node.name node
+  logDebug $ "Inspecting " ++ name
+  report <- liftIO $ queryStatus node
+  case report of
+    Just (J.JSObject obj)
+      | Just orig@(J.JSObject origobj) <- lookup "data" $ J.fromJSObject obj,
+        Just s <- lookup "status" $ J.fromJSObject origobj,
+        J.Ok state <- J.readJSON s,
+        state /= RANoop -> do -- only a status other than RANoop is relevant
+          let origs = J.encode orig
+          logDebug $ "Relevant event on " ++ name ++ ": " ++ origs
+          incidents <- getIncidents memstate
+          unless (any (liftA2 (&&) -- known: same node and same "data" object
+                        ((==) name . incidentNode)
+                        ((==) orig . incidentOriginal)) incidents) $ do
+            logInfo $ "Registering new incident on " ++ name ++ ": " ++ origs
+            uuid <- liftIO newUUID
+            now <- liftIO getClockTime
+            let tag = "maintd:repairready:" ++ uuid
+                incident = Incident { incidentOriginal = orig
+                                    , incidentAction = state
+                                    , incidentRepairStatus = RSNoted
+                                    , incidentJobs = []
+                                    , incidentNode = name
+                                    , incidentTag = tag
+                                    , incidentUuid = uuid
+                                    , incidentCtime = now
+                                    , incidentMtime = now
+                                    , incidentSerial = 1
+                                    }
+            liftIO $ updateIncident memstate incident
+    _ -> return () -- no report, or a report without a usable status
+
+
+-- | Query the MonDs of all nodes in the list for node-status updates.
+collectIncidents :: IORef MemoryState -> Node.List -> ResultT String IO ()
+collectIncidents memstate nl = do
+  _ <- getIncidents memstate -- result unused: this call is made to always
+                             -- update the memory state, even with no findings
+  logDebug "Querying all nodes for incidents"
+  mapM_ (updateNode memstate) $ Container.elems nl
diff --git a/src/Ganeti/MaintD/Server.hs b/src/Ganeti/MaintD/Server.hs
index d634215..d3657cd 100644
--- a/src/Ganeti/MaintD/Server.hs
+++ b/src/Ganeti/MaintD/Server.hs
@@ -69,6 +69,7 @@ import Ganeti.Logging.Lifted
 import qualified Ganeti.Luxi as L
 import Ganeti.MaintD.Autorepairs (harepTasks)
 import Ganeti.MaintD.Balance (balanceTask)
+import Ganeti.MaintD.CollectIncidents (collectIncidents)
 import Ganeti.MaintD.MemoryState
 import qualified Ganeti.Path as Path
 import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
@@ -136,6 +137,7 @@ maintenance memstate = do
   let il = cdInstances cData
       nl = cdNodes cData
       nidxs = Set.fromList $ Container.keys nl
+  collectIncidents memstate nl
   (nidxs', jobs) <- harepTasks (nl, il) nidxs
   unless (null jobs)
    . liftIO $ appendJobs memstate jobs
-- 
2.5.0.rc2.392.g76e840b

Reply via email to