From: Michal Hocko <[email protected]>

We have had a hardcoded 120s timeout, after which memory offline fails,
basically ever since hot remove was introduced. This is essentially
a policy implemented in the kernel. Moreover, there is no way to adjust
the timeout, so we sometimes face memory offline failures when
the system is under heavy memory pressure or a very intensive CPU
workload on large machines.

It is not very clear what purpose the timeout actually serves. The
offline operation is interruptible by a signal so if userspace wants
some timeout based termination this can be done trivially by sending a
signal.

If there is a strong use case for doing this from the kernel, then we
should do it properly and make it tunable from userspace, with the timeout
disabled by default, along with an explanation of who uses it and for what
purpose.

Acked-by: Vlastimil Babka <[email protected]>
Signed-off-by: Michal Hocko <[email protected]>
---
 mm/memory_hotplug.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c9dcbe6d2ac6..b8a85c11360e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1593,9 +1593,9 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
 }
 
 static int __ref __offline_pages(unsigned long start_pfn,
-                 unsigned long end_pfn, unsigned long timeout)
+                 unsigned long end_pfn)
 {
-       unsigned long pfn, nr_pages, expire;
+       unsigned long pfn, nr_pages;
        long offlined_pages;
        int ret, node;
        unsigned long flags;
@@ -1633,12 +1633,8 @@ static int __ref __offline_pages(unsigned long start_pfn,
                goto failed_removal;
 
        pfn = start_pfn;
-       expire = jiffies + timeout;
 repeat:
        /* start memory hot removal */
-       ret = -EBUSY;
-       if (time_after(jiffies, expire))
-               goto failed_removal;
        ret = -EINTR;
        if (signal_pending(current))
                goto failed_removal;
@@ -1711,7 +1707,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
 /* Must be protected by mem_hotplug_begin() or a device_lock */
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
-       return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+       return __offline_pages(start_pfn, start_pfn + nr_pages);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-- 
2.14.1

Reply via email to