Re: [PULL 132/136] mem-prealloc: optimize large guest startup
On 16/03/20 09:42, Laurent Vivier wrote: > Hi, > > a bug has been reported in launchpad for this patch: > > [Regression]Powerpc kvm guest unable to start with hugepage backed > memory > https://bugs.launchpad.net/qemu/+bug/1866962 Indeed, I'm sending the pull request with the fix today. Sorry for the breakage. Paolo
Re: [PULL 132/136] mem-prealloc: optimize large guest startup
Hi, a bug has been reported in launchpad for this patch: [Regression]Powerpc kvm guest unable to start with hugepage backed memory https://bugs.launchpad.net/qemu/+bug/1866962 Thanks, Laurent Le 25/02/2020 à 13:07, Paolo Bonzini a écrit : > From: bauerchen > > [desc]: > Large memory VM starts slowly when using -mem-prealloc, and > there are some areas to optimize in current method; > > 1、mmap will be used to alloc threads stack during create page > clearing threads, and it will attempt mm->mmap_sem for write > lock, but clearing threads have hold read lock, this competition > will cause threads createion very slow; > > 2、methods of calcuating pages for per threads is not well;if we use > 64 threads to split 160 hugepage,63 threads clear 2page,1 thread > clear 34 page,so the entire speed is very slow; > > to solve the first problem,we add a mutex in thread function,and > start all threads when all threads finished createion; > and the second problem, we spread remainder to other threads,in > situation that 160 hugepage and 64 threads, there are 32 threads > clear 3 pages,and 32 threads clear 2 pages. > > [test]: > 320G 84c VM start time can be reduced to 10s > 680G 84c VM start time can be reduced to 18s > > Signed-off-by: bauerchen > Reviewed-by: Pan Rui > Reviewed-by: Ivan Ren > [Simplify computation of the number of pages per thread. 
- Paolo] > Signed-off-by: Paolo Bonzini > --- > util/oslib-posix.c | 32 > 1 file changed, 24 insertions(+), 8 deletions(-) > > diff --git a/util/oslib-posix.c b/util/oslib-posix.c > index 5a291cc..897e8f3 100644 > --- a/util/oslib-posix.c > +++ b/util/oslib-posix.c > @@ -76,6 +76,10 @@ static MemsetThread *memset_thread; > static int memset_num_threads; > static bool memset_thread_failed; > > +static QemuMutex page_mutex; > +static QemuCond page_cond; > +static bool threads_created_flag; > + > int qemu_get_thread_id(void) > { > #if defined(__linux__) > @@ -403,6 +407,17 @@ static void *do_touch_pages(void *arg) > MemsetThread *memset_args = (MemsetThread *)arg; > sigset_t set, oldset; > > +/* > + * On Linux, the page faults from the loop below can cause mmap_sem > + * contention with allocation of the thread stacks. Do not start > + * clearing until all threads have been created. > + */ > +qemu_mutex_lock(&page_mutex); > +while (!threads_created_flag) { > +qemu_cond_wait(&page_cond, &page_mutex); > +} > +qemu_mutex_unlock(&page_mutex); > + > /* unblock SIGBUS */ > sigemptyset(&set); > sigaddset(&set, SIGBUS); > @@ -451,27 +466,28 @@ static inline int get_memset_num_threads(int smp_cpus) > static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, > int smp_cpus) > { > -size_t numpages_per_thread; > -size_t size_per_thread; > +size_t numpages_per_thread, leftover; > char *addr = area; > int i = 0; > > memset_thread_failed = false; > +threads_created_flag = false; > memset_num_threads = get_memset_num_threads(smp_cpus); > memset_thread = g_new0(MemsetThread, memset_num_threads); > -numpages_per_thread = (numpages / memset_num_threads); > -size_per_thread = (hpagesize * numpages_per_thread); > +numpages_per_thread = numpages / memset_num_threads; > +leftover = numpages % memset_num_threads; > for (i = 0; i < memset_num_threads; i++) { > memset_thread[i].addr = addr; > -memset_thread[i].numpages = (i == (memset_num_threads - 1)) ? 
> -numpages : numpages_per_thread; > +memset_thread[i].numpages = numpages_per_thread + (i < leftover); > memset_thread[i].hpagesize = hpagesize; > qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", > do_touch_pages, &memset_thread[i], > QEMU_THREAD_JOINABLE); > -addr += size_per_thread; > -numpages -= numpages_per_thread; > +addr += memset_thread[i].numpages * hpagesize; > } > +threads_created_flag = true; > +qemu_cond_broadcast(&page_cond); > + > for (i = 0; i < memset_num_threads; i++) { > qemu_thread_join(&memset_thread[i].pgthread); > } >
[PULL 132/136] mem-prealloc: optimize large guest startup
From: bauerchen [desc]: Large memory VM starts slowly when using -mem-prealloc, and there are some areas to optimize in current method; 1、mmap will be used to alloc threads stack during create page clearing threads, and it will attempt mm->mmap_sem for write lock, but clearing threads have hold read lock, this competition will cause threads createion very slow; 2、methods of calcuating pages for per threads is not well;if we use 64 threads to split 160 hugepage,63 threads clear 2page,1 thread clear 34 page,so the entire speed is very slow; to solve the first problem,we add a mutex in thread function,and start all threads when all threads finished createion; and the second problem, we spread remainder to other threads,in situation that 160 hugepage and 64 threads, there are 32 threads clear 3 pages,and 32 threads clear 2 pages. [test]: 320G 84c VM start time can be reduced to 10s 680G 84c VM start time can be reduced to 18s Signed-off-by: bauerchen Reviewed-by: Pan Rui Reviewed-by: Ivan Ren [Simplify computation of the number of pages per thread. - Paolo] Signed-off-by: Paolo Bonzini --- util/oslib-posix.c | 32 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 5a291cc..897e8f3 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -76,6 +76,10 @@ static MemsetThread *memset_thread; static int memset_num_threads; static bool memset_thread_failed; +static QemuMutex page_mutex; +static QemuCond page_cond; +static bool threads_created_flag; + int qemu_get_thread_id(void) { #if defined(__linux__) @@ -403,6 +407,17 @@ static void *do_touch_pages(void *arg) MemsetThread *memset_args = (MemsetThread *)arg; sigset_t set, oldset; +/* + * On Linux, the page faults from the loop below can cause mmap_sem + * contention with allocation of the thread stacks. Do not start + * clearing until all threads have been created. 
+ */ +qemu_mutex_lock(&page_mutex); +while (!threads_created_flag) { +qemu_cond_wait(&page_cond, &page_mutex); +} +qemu_mutex_unlock(&page_mutex); + /* unblock SIGBUS */ sigemptyset(&set); sigaddset(&set, SIGBUS); @@ -451,27 +466,28 @@ static inline int get_memset_num_threads(int smp_cpus) static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, int smp_cpus) { -size_t numpages_per_thread; -size_t size_per_thread; +size_t numpages_per_thread, leftover; char *addr = area; int i = 0; memset_thread_failed = false; +threads_created_flag = false; memset_num_threads = get_memset_num_threads(smp_cpus); memset_thread = g_new0(MemsetThread, memset_num_threads); -numpages_per_thread = (numpages / memset_num_threads); -size_per_thread = (hpagesize * numpages_per_thread); +numpages_per_thread = numpages / memset_num_threads; +leftover = numpages % memset_num_threads; for (i = 0; i < memset_num_threads; i++) { memset_thread[i].addr = addr; -memset_thread[i].numpages = (i == (memset_num_threads - 1)) ? -numpages : numpages_per_thread; +memset_thread[i].numpages = numpages_per_thread + (i < leftover); memset_thread[i].hpagesize = hpagesize; qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", do_touch_pages, &memset_thread[i], QEMU_THREAD_JOINABLE); -addr += size_per_thread; -numpages -= numpages_per_thread; +addr += memset_thread[i].numpages * hpagesize; } +threads_created_flag = true; +qemu_cond_broadcast(&page_cond); + for (i = 0; i < memset_num_threads; i++) { qemu_thread_join(&memset_thread[i].pgthread); } -- 1.8.3.1