The existing memory model in htools was reporting incorrect node memory, missing memory and also available (free) memory on KVM that resulted in some cases in overcramming nodes (overcommitting memory without having swap to evict pages from RAM) leading to OOM kills.
This model is equivalent to the model proposed in issue 127, but doesn't introduce overcommitment (swap or ballooning) support nor does it depend on Ganeti 2.17 features. The goal is to distinguish between reported (state-of-world) values and recorded (state-of-record) values and not to mix them arbitrarily. To estimate free memory we always need to take the minimum of measured/calculated values using those two states (root causes of the differences explained in missing memory) in order to be on the safe side. Signed-off-by: Viktor Bachraty <[email protected]> --- src/Ganeti/HTools/Cluster.hs | 17 ++-- src/Ganeti/HTools/Loader.hs | 17 ++-- src/Ganeti/HTools/Node.hs | 150 +++++++++++++++++++++-------- test/hs/Test/Ganeti/HTools/Backend/Text.hs | 14 ++- test/hs/Test/Ganeti/HTools/Node.hs | 11 +-- 5 files changed, 142 insertions(+), 67 deletions(-) diff --git a/src/Ganeti/HTools/Cluster.hs b/src/Ganeti/HTools/Cluster.hs index 8e4327ce9..4276b90ea 100644 --- a/src/Ganeti/HTools/Cluster.hs +++ b/src/Ganeti/HTools/Cluster.hs @@ -226,11 +226,9 @@ updateCStats cs node = csFspn = x_fspn, csIspn = x_ispn, csTspn = x_tspn } = cs - inc_amem = Node.fMem node - Node.rMem node - inc_amem' = if inc_amem > 0 then inc_amem else 0 + inc_amem = Node.availMem node inc_adsk = Node.availDisk node - inc_imem = truncate (Node.tMem node) - Node.nMem node - - Node.xMem node - Node.fMem node + inc_imem = Node.iMem node inc_icpu = Node.uCpu node inc_idsk = truncate (Node.tDsk node) - Node.fDsk node inc_ispn = Node.tSpindles node - Node.fSpindles node @@ -238,13 +236,13 @@ updateCStats cs node = inc_acpu = Node.availCpu node inc_ncpu = fromIntegral (Node.uCpu node) / iPolicyVcpuRatio (Node.iPolicy node) - in cs { csFmem = x_fmem + fromIntegral (Node.fMem node) + in cs { csFmem = x_fmem + fromIntegral (Node.guaranteedFreeMem node) , csFdsk = x_fdsk + fromIntegral (Node.fDsk node) , csFspn = x_fspn + fromIntegral (Node.fSpindles node) - , csAmem = x_amem + fromIntegral inc_amem' + , csAmem = 
x_amem + fromIntegral inc_amem , csAdsk = x_adsk + fromIntegral inc_adsk , csAcpu = x_acpu + fromIntegral inc_acpu - , csMmem = max x_mmem (fromIntegral inc_amem') + , csMmem = max x_mmem (fromIntegral inc_amem) , csMdsk = max x_mdsk (fromIntegral inc_adsk) , csMcpu = max x_mcpu (fromIntegral inc_acpu) , csImem = x_imem + fromIntegral inc_imem @@ -778,9 +776,8 @@ iterateAllocSmallStep opts nl il limit newinst allocnodes ixes cstats = guessBigstepSize :: Node.List -> Instance.Instance -> Int guessBigstepSize nl inst = let nodes = Container.elems nl - totalUnusedMemory = sum $ map Node.fMem nodes - reserved = round . maximum $ map Node.tMem nodes - capacity = (totalUnusedMemory - reserved) `div` Instance.mem inst + totalAvail = sum $ map Node.availMem nodes + capacity = totalAvail `div` Instance.mem inst -- however, at every node we might lose almost an instance if it just -- doesn't fit by a tiny margin guess = capacity - Container.size nl diff --git a/src/Ganeti/HTools/Loader.hs b/src/Ganeti/HTools/Loader.hs index ed7ae22f6..b582951ab 100644 --- a/src/Ganeti/HTools/Loader.hs +++ b/src/Ganeti/HTools/Loader.hs @@ -56,6 +56,7 @@ module Ganeti.HTools.Loader , extractDesiredLocations , updateDesiredLocationTags , setStaticKvmNodeMem + , updateMemStat ) where import Control.Monad @@ -363,8 +364,9 @@ mergeData um extags selinsts exinsts time cdata@(ClusterData gl nl il ctags _) = (`Node.buildPeers` il4)) nl3 il6 = Container.map (disableSplitMoves nl3) il5 nl5 = Container.map (addMigrationTags ctags) nl4 + nl6 = Container.map (`updateMemStat` il6) nl5 in if' (null lkp_unknown) - (Ok cdata { cdNodes = nl5, cdInstances = il6 }) + (Ok cdata { cdNodes = nl6, cdInstances = il6 }) (Bad $ "Unknown instance(s): " ++ show(map lrContent lkp_unknown)) -- | In a cluster description, clear dynamic utilisation information. 
@@ -381,6 +383,12 @@ setStaticKvmNodeMem cdata@(ClusterData _ nl _ _ _) staticNodeMem = else Node.nMem node }) nl in Ok cdata { cdNodes = nl' } +-- | Update node memory stat based on instance list. +updateMemStat :: Node.Node -> Instance.List -> Node.Node +updateMemStat node il = + let node' = node { Node.iMem = nodeImem node il } + in node' { Node.pMem = fromIntegral (Node.guaranteedFreeMem node') / Node.tMem node' } + -- | Checks the cluster data for consistency. checkData :: Node.List -> Instance.List -> ([String], Node.List) @@ -388,14 +396,11 @@ checkData nl il = Container.mapAccum (\ msgs node -> let nname = Node.name node - delta_mem = truncate (Node.tMem node) - - Node.nMem node - - Node.fMem node - - nodeImem node il + delta_mem = Node.missingMem node delta_dsk = truncate (Node.tDsk node) - Node.fDsk node - nodeIdsk node il - newn = node `Node.setXmem` delta_mem + newn = node { Node.xMem = delta_mem } umsg1 = if delta_mem > 512 || delta_dsk > 1024 then printf "node %s is missing %d MB ram \ diff --git a/src/Ganeti/HTools/Node.hs b/src/Ganeti/HTools/Node.hs index 8b2f25958..27aa93b96 100644 --- a/src/Ganeti/HTools/Node.hs +++ b/src/Ganeti/HTools/Node.hs @@ -46,7 +46,6 @@ module Ganeti.HTools.Node , setIdx , setAlias , setOffline - , setXmem , setPri , calcFmemOfflineOrForthcoming , setSec @@ -78,8 +77,10 @@ module Ganeti.HTools.Node -- * Stats , availDisk , availMem + , missingMem + , guaranteedFreeMem + , recordedFreeMem , availCpu - , iMem , iDsk , conflictingPrimaries -- * Generate OpCodes @@ -133,11 +134,12 @@ type TagMap = Map.Map String Int data Node = Node { name :: String -- ^ The node name , alias :: String -- ^ The shortened name (for display purposes) - , tMem :: Double -- ^ Total memory (MiB) - , nMem :: Int -- ^ Node memory (MiB) - , fMem :: Int -- ^ Free memory (MiB) + , tMem :: Double -- ^ Total memory (MiB) (state-of-world) + , nMem :: Int -- ^ Node memory (MiB) (state-of-record) + , iMem :: Int -- ^ Instance memory (MiB) (state-of-record) + 
, fMem :: Int -- ^ Free memory (MiB) (state-of-world) , fMemForth :: Int -- ^ Free memory (MiB) including forthcoming - -- instances + -- instances TODO: Extend new memory model for forthcoming instances , xMem :: Int -- ^ Unaccounted memory (MiB) , tDsk :: Double -- ^ Total disk space (MiB) , fDsk :: Int -- ^ Free disk space (MiB) @@ -321,6 +323,9 @@ create name_init mem_t_init mem_n_init mem_f_init , alias = name_init , tMem = mem_t_init , nMem = mem_n_init + , iMem = 0 -- updated after instances are loaded + , xMem = 0 -- updated after instances are loaded + , pMem = 0 -- updated after instances are loaded , fMem = mem_f_init , fMemForth = mem_f_init , tDsk = dsk_t_init @@ -344,7 +349,6 @@ create name_init mem_t_init mem_n_init mem_f_init , peers = P.empty , rMem = 0 , rMemForth = 0 - , pMem = fromIntegral mem_f_init / mem_t_init , pMemForth = fromIntegral mem_f_init / mem_t_init , pDsk = if excl_stor then computePDsk spindles_f_init $ fromIntegral spindles_t_init @@ -360,7 +364,6 @@ create name_init mem_t_init mem_n_init mem_f_init , offline = offline_init , isMaster = False , nTags = [] - , xMem = 0 , mDsk = T.defReservedDiskRatio , loDsk = mDskToloDsk T.defReservedDiskRatio dsk_t_init , hiCpu = mCpuTohiCpu (T.iPolicyVcpuRatio T.defIPolicy) cpu_t_init @@ -431,10 +434,6 @@ setRecvMigrationTags t val = t { rmigTags = val } setLocationTags :: Node -> Set.Set String -> Node setLocationTags t val = t { locationTags = val } --- | Sets the unnaccounted memory. -setXmem :: Node -> Int -> Node -setXmem t val = t { xMem = val } - -- | Sets the hypervisor attribute. 
setHypervisor :: Node -> String -> Node setHypervisor t val = t { hypervisor = val } @@ -761,14 +760,16 @@ removePri t inst = then updateForthcomingFields t else let new_plist = delete iname (pList t) - new_mem = incIf (Instance.usesMemory inst) (fMem t) - (Instance.mem inst) + + (new_i_mem, new_free_mem) = perspectiveMem t inst False + new_p_mem = fromIntegral new_free_mem / tMem t + new_failn1 = new_free_mem <= rMem t + new_dsk = incIf uses_disk (fDsk t) (Instance.dsk inst) new_free_sp = calcNewFreeSpindles False t inst new_inst_sp = calcSpindleUse False t inst - new_mp = fromIntegral new_mem / tMem t new_dp = computeNewPDsk t new_free_sp new_dsk - new_failn1 = new_mem <= rMem t + new_ucpu = decIf i_online (uCpu t) (Instance.vcpus inst) new_rcpu = fromIntegral new_ucpu / tCpu t new_load = utilLoad t `T.subUtil` Instance.util inst @@ -777,8 +778,8 @@ removePri t inst = $ getLocationExclusionPairs t inst in updateForthcomingFields $ - t { pList = new_plist, fMem = new_mem, fDsk = new_dsk - , failN1 = new_failn1, pMem = new_mp, pDsk = new_dp + t { pList = new_plist, iMem = new_i_mem, fDsk = new_dsk + , failN1 = new_failn1, pMem = new_p_mem, pDsk = new_dp , uCpu = new_ucpu, pCpu = new_rcpu, utilLoad = new_load , instSpindles = new_inst_sp, fSpindles = new_free_sp , locationScore = locationScore t @@ -837,7 +838,7 @@ removeSec t inst = then old_rmem else computeMaxRes new_peers new_prem = fromIntegral new_rmem / tMem t - new_failn1 = fMem t <= new_rmem + new_failn1 = guaranteedFreeMem t <= new_rmem new_dp = computeNewPDsk t new_free_sp new_dsk old_load = utilLoad t new_load = old_load @@ -923,24 +924,23 @@ addPriEx force t inst = _ -> Ok $ updateForthcomingFields t else let - new_mem = decIf (Instance.usesMemory inst) (fMem t) - (Instance.mem inst) + (new_i_mem, new_free_mem) = perspectiveMem t inst True + new_p_mem = fromIntegral new_free_mem / tMem t + new_failn1 = new_free_mem <= rMem t + new_dsk = decIf uses_disk (fDsk t) (Instance.dsk inst) new_free_sp = 
calcNewFreeSpindles True t inst new_inst_sp = calcSpindleUse True t inst - new_failn1 = new_mem <= rMem t new_ucpu = incIf i_online (uCpu t) (Instance.vcpus inst) new_pcpu = fromIntegral new_ucpu / tCpu t new_dp = computeNewPDsk t new_free_sp new_dsk new_load = utilLoad t `T.addUtil` Instance.util inst new_plist = iname:pList t - new_mp = fromIntegral new_mem / tMem t - new_instance_map = addTags (instanceMap t) $ getLocationExclusionPairs t inst in case () of - _ | new_mem <= 0 -> Bad T.FailMem + _ | new_free_mem <= 0 -> Bad T.FailMem | uses_disk && new_dsk <= 0 -> Bad T.FailDisk | strict && uses_disk && new_dsk < loDsk t -> Bad T.FailDisk | uses_disk && exclStorage t && new_free_sp < 0 -> Bad T.FailSpindles @@ -955,10 +955,10 @@ addPriEx force t inst = | otherwise -> Ok . updateForthcomingFields $ t { pList = new_plist - , fMem = new_mem + , iMem = new_i_mem , fDsk = new_dsk , failN1 = new_failn1 - , pMem = new_mp + , pMem = new_p_mem , pDsk = new_dp , uCpu = new_ucpu , pCpu = new_pcpu @@ -1036,7 +1036,7 @@ addSecExEx ignore_disks force t inst pdx = _ -> Ok $ updateForthcomingFields t else let - old_mem = fMem t + old_mem = guaranteedFreeMem t new_dsk = fDsk t - Instance.dsk inst new_free_sp = calcNewFreeSpindles True t inst new_inst_sp = calcSpindleUse True t inst @@ -1095,14 +1095,86 @@ availDisk t = iDsk :: Node -> Int iDsk t = truncate (tDsk t) - fDsk t +-- | Returns state-of-world free memory on the node. +-- | NOTE: This value is valid only before placement simulations. +-- | TODO: Redefine this for memory overcommitment. +reportedFreeMem :: Node -> Int +reportedFreeMem = fMem + +-- | Computes state-of-record free memory on the node. +-- | TODO: Redefine this for memory overcommitment. +recordedFreeMem :: Node -> Int +recordedFreeMem t = + let node = nMem t + total = tMem t + inst = iMem t + in truncate total - node - inst + +-- | Computes the amount of missing memory on the node. 
+-- NOTE: This formula uses free memory for calculations as opposed to +-- used_memory in the definition, that's why it is the inverse. +-- Explanations for missing memory (+) positive, (-) negative: +-- (+) instances are using more memory than state-of-record +-- - on KVM this might be due to the overhead per qemu process +-- - on Xen manually upsized domains (xen mem-set) +-- (+) on KVM non-qemu processes might be using more memory than what is +-- reserved for node (no isolation) +-- (-) on KVM qemu processes allocate memory on demand, thus an instance grows +-- over its lifetime until it reaches state-of-record (+overhead) +-- (-) on KVM KSM might be active +-- (-) on Xen manually downsized domains (xen mem-set) +missingMem :: Node -> Int +missingMem t = + let state_of_world = reportedFreeMem t + state_of_record = recordedFreeMem t + in state_of_record - state_of_world + +-- | Computes the 'guaranteed' free memory, that is the minimum of what +-- is reported by the node (available bytes) and our calculation based on +-- instance sizes (our records), thus considering missing memory. +-- NOTE 1: During placement simulations, the recorded memory changes, as +-- instances are added/removed from the node, thus we have to calculate the +-- missingMem (correction) before altering state-of-record and then +-- use that correction to estimate state-of-world memory usage _after_ +-- the placements are done rather than doing min(record, world). +-- NOTE 2: This is still only an approximation on KVM. As we shuffle instances +-- during the simulation we are considering their state-of-record size, but +-- in the real world the moves would shuffle parts of missing memory as well. +-- Unfortunately as long as we don't have a more fine-grained model that can +-- better explain missing memory (split down based on root causes), we can't +-- do better. +-- NOTE 3: This is a hard limit based on available bytes and our bookkeeping. 
+-- In case of memory overcommitment, both recordedFreeMem and reportedFreeMem +-- would be extended by swap size on KVM or balloon size on Xen (their nominal +-- and reported values). +guaranteedFreeMem :: Node -> Int +guaranteedFreeMem t = + let state_of_record = recordedFreeMem t + in state_of_record - max 0 (missingMem t) + -- | Computes the amount of available memory on a given node. +-- Compared to guaranteedFreeMem, this also takes into account memory reserved for +-- secondary instances. +-- NOTE: In case of memory overcommitment, there would also be an additional +-- soft limit based on RAM size dedicated for instances and sum of +-- state-of-record instance sizes (iMem): (tMem - nMem)*overcommit_ratio - iMem availMem :: Node -> Int availMem t = - let _f = fMem t - _l = rMem t - in if _f < _l - then 0 - else _f - _l + let reserved = rMem t + guaranteed = guaranteedFreeMem t + in max 0 (guaranteed - reserved) + +-- | Perspective memory stats after instance operation. +perspectiveMem :: Node -> Instance + -> Bool -- ^ Operation: True if add, False for remove. + -> (Int, Int) -- ^ Tuple (memory_used_by_instances, guaranteed_free_mem) +perspectiveMem node inst add = + let uses_mem = (Instance.usesMemory inst) + condOp = if add then incIf else decIf + new_i_mem = condOp uses_mem (iMem node) (Instance.mem inst) + new_node = node { iMem = new_i_mem } + new_free_mem = guaranteedFreeMem new_node + in (new_i_mem, new_free_mem) -- | Computes the amount of available memory on a given node. availCpu :: Node -> Int availCpu t = @@ -1113,10 +1185,6 @@ availCpu t = then _l - _u else 0 --- | The memory used by instances on a given node. 
-iMem :: Node -> Int -iMem t = truncate (tMem t) - nMem t - xMem t - fMem t - -- * Node graph functions -- These functions do the transformations needed so that nodes can be -- represented as a graph connected by the instances that are replicated @@ -1194,9 +1262,10 @@ showField t field = "nmem" -> printf "%5d" $ nMem t "xmem" -> printf "%5d" $ xMem t "fmem" -> printf "%5d" $ fMem t + "gmem" -> printf "%5d" $ guaranteedFreeMem t "imem" -> printf "%5d" $ iMem t "rmem" -> printf "%5d" $ rMem t - "amem" -> printf "%5d" $ fMem t - rMem t + "amem" -> printf "%5d" $ availMem t "tdsk" -> printf "%5.0f" $ tDsk t / 1024 "fdsk" -> printf "%5d" $ fDsk t `div` 1024 "tcpu" -> printf "%4.0f" $ tCpu t @@ -1235,9 +1304,10 @@ showHeader field = "nmem" -> ("n_mem", True) "xmem" -> ("x_mem", True) "fmem" -> ("f_mem", True) + "gmem" -> ("g_mem", True) + "amem" -> ("a_mem", True) "imem" -> ("i_mem", True) "rmem" -> ("r_mem", True) - "amem" -> ("a_mem", True) "tdsk" -> ("t_dsk", True) "fdsk" -> ("f_dsk", True) "tcpu" -> ("pcpu", True) @@ -1324,7 +1394,7 @@ genAddTagsOpCode node tags = OpCodes.OpTagsSet -- | Constant holding the fields we're displaying by default. 
defaultFields :: [String] defaultFields = - [ "status", "name", "tmem", "nmem", "imem", "xmem", "fmem" + [ "status", "name", "tmem", "nmem", "imem", "xmem", "fmem", "gmem", "amem" , "rmem", "tdsk", "fdsk", "tcpu", "ucpu", "pcnt", "scnt" , "pfmem", "pfdsk", "rcpu" , "cload", "mload", "dload", "nload" ] diff --git a/test/hs/Test/Ganeti/HTools/Backend/Text.hs b/test/hs/Test/Ganeti/HTools/Backend/Text.hs index 7b1f48509..ebd9b5573 100644 --- a/test/hs/Test/Ganeti/HTools/Backend/Text.hs +++ b/test/hs/Test/Ganeti/HTools/Backend/Text.hs @@ -213,8 +213,11 @@ prop_NodeLSIdempotent :: Property prop_NodeLSIdempotent = forAll (genNode (Just 1) Nothing) $ \node -> -- override failN1 to what loadNode returns by default + -- override pMem as that is updated after loading in Loader.updateMemStat let n = Node.setPolicy Types.defIPolicy $ - node { Node.failN1 = True, Node.offline = False } + node { Node.failN1 = True, + Node.offline = False, + Node.pMem = 0} in (Text.loadNode defGroupAssoc. Utils.sepSplit '|' . Text.serializeNode defGroupList) n ==? @@ -276,7 +279,12 @@ prop_CreateSerialise = Ok (_, _, _, [], _) -> counterexample "Failed to allocate: no allocations" False Ok (_, nl', il, _, _) -> - let cdata = Loader.ClusterData defGroupList nl' il ctags + let + -- makeSmallCluster created an empty cluster, that had some instances allocated, + -- so we need to simulate that the hypervisor now reports less fMem, otherwise + -- Loader.checkData will detect missing memory after deserialization. + nl1 = Container.map (\n -> n { Node.fMem = Node.recordedFreeMem n }) nl' + cdata = Loader.ClusterData defGroupList nl1 il ctags Types.defIPolicy saved = Text.serializeCluster cdata in case Text.parseData saved >>= Loader.mergeData [] [] [] [] (TOD 0 0) @@ -288,7 +296,7 @@ prop_CreateSerialise = , cpol2 ==? Types.defIPolicy , il2 ==? il , gl2 ==? defGroupList - , nl3 ==? nl' + , nl3 ==? 
nl1 ] testSuite "HTools/Backend/Text" diff --git a/test/hs/Test/Ganeti/HTools/Node.hs b/test/hs/Test/Ganeti/HTools/Node.hs index e7f46e2a2..59a8c1842 100644 --- a/test/hs/Test/Ganeti/HTools/Node.hs +++ b/test/hs/Test/Ganeti/HTools/Node.hs @@ -106,8 +106,9 @@ genNode min_multiplier max_multiplier = do let n = Node.create name (fromIntegral mem_t) mem_n mem_f (fromIntegral dsk_t) dsk_f (fromIntegral cpu_t) cpu_n offl spindles 0 0 False - n' = Node.setPolicy nullIPolicy n - return $ Node.buildPeers n' Container.empty + n1 = Node.setPolicy nullIPolicy n + n2 = Loader.updateMemStat n1 Container.empty + return $ Node.buildPeers n2 Container.empty -- | Helper function to generate a sane node. genOnlineNode :: Gen Node.Node @@ -204,11 +205,6 @@ prop_setOffline node status = Node.offline newnode ==? status where newnode = Node.setOffline node status -prop_setXmem :: Node.Node -> Int -> Property -prop_setXmem node xm = - Node.xMem newnode ==? xm - where newnode = Node.setXmem node xm - prop_setMcpu :: Node.Node -> Double -> Property prop_setMcpu node mc = Types.iPolicyVcpuRatio (Node.iPolicy newnode) ==? mc @@ -469,7 +465,6 @@ testSuite "HTools/Node" [ 'prop_setAlias , 'prop_setOffline , 'prop_setMcpu - , 'prop_setXmem , 'prop_addPriFM , 'prop_addPriFD , 'prop_addPriFS -- 2.12.0.367.g23dc2f6d3c-goog
