Re: [Xen-devel] [PATCH] libxl: fix dom0 balloon logic

2015-03-24 Thread Jim Fehlig
Stefano Stabellini wrote:
 On Mon, 23 Mar 2015, Ian Campbell wrote:
   
 (just ccing the other tools maintainers, in particular Stefano who knows
 what this stuff is supposed to do...)

 On Fri, 2015-03-20 at 17:10 -0600, Jim Fehlig wrote:
 
 Recent testing on large memory systems revealed a bug in the Xen xl
 tool's freemem() function.  When autoballooning is enabled, freemem()
 is used to ensure enough memory is available to start a domain,
 ballooning dom0 if necessary.  When ballooning large amounts of memory
 from dom0, freemem() would exceed its self-imposed wait time and
 return an error.  Meanwhile, dom0 continued to balloon.  Starting the
 domain later, after sufficient memory was ballooned from dom0, would
 succeed.  The libvirt implementation in libxlDomainFreeMem() suffers
 the same bug since it is modeled after freemem().

 In the end, the best place to fix the bug on the Xen side was to
 slightly change the behavior of libxl_wait_for_memory_target().
 Instead of failing after caller-provided wait_sec, the function now
 blocks as long as dom0 memory ballooning is progressing.  It will return
 failure only when more memory is needed to reach the target and wait_sec
 have expired with no progress being made.  See xen.git commit fd3aa246.
 There was a dicussion on how this would affect other libxl apps like
 libvirt

 http://lists.xen.org/archives/html/xen-devel/2015-03/msg00739.html

 If libvirt containing this patch was build against a Xen containing
 the old libxl_wait_for_memory_target() behavior, libxlDomainFreeMem()
 will fail after 30 sec and domain creation will be terminated.
 Without this patch and with old libxl_wait_for_memory_target() behavior,
 libxlDomainFreeMem() does not succeed after 30 sec, but returns success
 anyway.  Domain creation continues resulting in all sorts of fun stuff
 like cpu soft lockups in the guest OS.  It was decided to properly fix
 libxl_wait_for_memory_target(), and if anything improve the default
 behavior of apps using the freemem reference impl in xl.

 xl was patched to accommodate the change in libxl_wait_for_memory_target()
 with xen.git commit 883b30a0.  This patch does the same in the libxl
 driver.  While at it, I changed the logic to essentially match
 freemem() in $xensrc/tools/libxl/xl_cmdimpl.c.  It was a bit cleaner
 IMO and will make it easier to spot future, potentially interesting
 divergences.

 Signed-off-by: Jim Fehlig jfeh...@suse.com
 ---
  src/libxl/libxl_domain.c | 57 
 
  1 file changed, 28 insertions(+), 29 deletions(-)

 diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
 index 407a9bd..ff78133 100644
 --- a/src/libxl/libxl_domain.c
 +++ b/src/libxl/libxl_domain.c
 @@ -1121,38 +1121,41 @@ libxlDomainFreeMem(libxlDomainObjPrivatePtr priv, 
 libxl_domain_config *d_config)
  {
  uint32_t needed_mem;
  uint32_t free_mem;
 -size_t i;
 -int ret = -1;
 +int ret;
  int tries = 3;
  int wait_secs = 10;
  
 -if ((ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
 -needed_mem)) = 0) {
 -for (i = 0; i  tries; ++i) {
 -if ((ret = libxl_get_free_memory(priv-ctx, free_mem))  0)
 -break;
 +ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
 +   needed_mem);
 +if (ret  0)
 +goto error;
  
 -if (free_mem = needed_mem) {
 -ret = 0;
 -break;
 -}
 +do {
 +ret = libxl_get_free_memory(priv-ctx, free_mem);
 +if (ret  0)
 +goto error;
  
 -if ((ret = libxl_set_memory_target(priv-ctx, 0,
 -   free_mem - needed_mem,
 -   /* relative */ 1, 0))  0)
 -break;
 +if (free_mem = needed_mem)
 +return 0;
  
 -ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
 - wait_secs);
 -if (ret == 0 || ret != ERROR_NOMEM)
 -break;
 +ret = libxl_set_memory_target(priv-ctx, 0,
 +  free_mem - needed_mem,
 +  /* relative */ 1, 0);
 +if (ret  0)
 +goto error;
  
 -if ((ret = libxl_wait_for_memory_target(priv-ctx, 0, 1))  0)
 -break;
 -}
 -}
 +ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
 + wait_secs);
 +if (ret  0)
 +goto error;
   

 Shouldn't this be a call to libxl_wait_for_memory_target instead?
   

Err, right. One would think I would have caught that after all the talk
about libxl_wait_for_memory_target() in the commit msg :-/. V2 on the way...

Regards,
Jim


___

Re: [Xen-devel] [PATCH] libxl: fix dom0 balloon logic

2015-03-24 Thread Stefano Stabellini
On Mon, 23 Mar 2015, Ian Campbell wrote:
 (just ccing the other tools maintainers, in particular Stefano who knows
 what this stuff is supposed to do...)
 
 On Fri, 2015-03-20 at 17:10 -0600, Jim Fehlig wrote:
  Recent testing on large memory systems revealed a bug in the Xen xl
  tool's freemem() function.  When autoballooning is enabled, freemem()
  is used to ensure enough memory is available to start a domain,
  ballooning dom0 if necessary.  When ballooning large amounts of memory
  from dom0, freemem() would exceed its self-imposed wait time and
  return an error.  Meanwhile, dom0 continued to balloon.  Starting the
  domain later, after sufficient memory was ballooned from dom0, would
  succeed.  The libvirt implementation in libxlDomainFreeMem() suffers
  the same bug since it is modeled after freemem().
  
  In the end, the best place to fix the bug on the Xen side was to
  slightly change the behavior of libxl_wait_for_memory_target().
  Instead of failing after caller-provided wait_sec, the function now
  blocks as long as dom0 memory ballooning is progressing.  It will return
  failure only when more memory is needed to reach the target and wait_sec
  have expired with no progress being made.  See xen.git commit fd3aa246.
  There was a dicussion on how this would affect other libxl apps like
  libvirt
  
  http://lists.xen.org/archives/html/xen-devel/2015-03/msg00739.html
  
  If libvirt containing this patch was build against a Xen containing
  the old libxl_wait_for_memory_target() behavior, libxlDomainFreeMem()
  will fail after 30 sec and domain creation will be terminated.
  Without this patch and with old libxl_wait_for_memory_target() behavior,
  libxlDomainFreeMem() does not succeed after 30 sec, but returns success
  anyway.  Domain creation continues resulting in all sorts of fun stuff
  like cpu soft lockups in the guest OS.  It was decided to properly fix
  libxl_wait_for_memory_target(), and if anything improve the default
  behavior of apps using the freemem reference impl in xl.
  
  xl was patched to accommodate the change in libxl_wait_for_memory_target()
  with xen.git commit 883b30a0.  This patch does the same in the libxl
  driver.  While at it, I changed the logic to essentially match
  freemem() in $xensrc/tools/libxl/xl_cmdimpl.c.  It was a bit cleaner
  IMO and will make it easier to spot future, potentially interesting
  divergences.
 
  Signed-off-by: Jim Fehlig jfeh...@suse.com
  ---
   src/libxl/libxl_domain.c | 57 
  
   1 file changed, 28 insertions(+), 29 deletions(-)
  
  diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
  index 407a9bd..ff78133 100644
  --- a/src/libxl/libxl_domain.c
  +++ b/src/libxl/libxl_domain.c
  @@ -1121,38 +1121,41 @@ libxlDomainFreeMem(libxlDomainObjPrivatePtr priv, 
  libxl_domain_config *d_config)
   {
   uint32_t needed_mem;
   uint32_t free_mem;
  -size_t i;
  -int ret = -1;
  +int ret;
   int tries = 3;
   int wait_secs = 10;
   
  -if ((ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
  -needed_mem)) = 0) {
  -for (i = 0; i  tries; ++i) {
  -if ((ret = libxl_get_free_memory(priv-ctx, free_mem))  0)
  -break;
  +ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
  +   needed_mem);
  +if (ret  0)
  +goto error;
   
  -if (free_mem = needed_mem) {
  -ret = 0;
  -break;
  -}
  +do {
  +ret = libxl_get_free_memory(priv-ctx, free_mem);
  +if (ret  0)
  +goto error;
   
  -if ((ret = libxl_set_memory_target(priv-ctx, 0,
  -   free_mem - needed_mem,
  -   /* relative */ 1, 0))  0)
  -break;
  +if (free_mem = needed_mem)
  +return 0;
   
  -ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
  - wait_secs);
  -if (ret == 0 || ret != ERROR_NOMEM)
  -break;
  +ret = libxl_set_memory_target(priv-ctx, 0,
  +  free_mem - needed_mem,
  +  /* relative */ 1, 0);
  +if (ret  0)
  +goto error;
   
  -if ((ret = libxl_wait_for_memory_target(priv-ctx, 0, 1))  0)
  -break;
  -}
  -}
  +ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
  + wait_secs);
  +if (ret  0)
  +goto error;

Shouldn't this be a call to libxl_wait_for_memory_target instead?


  -return ret;
  +tries--;
  +} while (tries  0);
  +
  + error:
  +virReportSystemError(ret, %s,
  + _(Failed to 

Re: [Xen-devel] [PATCH] libxl: fix dom0 balloon logic

2015-03-23 Thread Ian Campbell
(just ccing the other tools maintainers, in particular Stefano who knows
what this stuff is supposed to do...)

On Fri, 2015-03-20 at 17:10 -0600, Jim Fehlig wrote:
 Recent testing on large memory systems revealed a bug in the Xen xl
 tool's freemem() function.  When autoballooning is enabled, freemem()
 is used to ensure enough memory is available to start a domain,
 ballooning dom0 if necessary.  When ballooning large amounts of memory
 from dom0, freemem() would exceed its self-imposed wait time and
 return an error.  Meanwhile, dom0 continued to balloon.  Starting the
 domain later, after sufficient memory was ballooned from dom0, would
 succeed.  The libvirt implementation in libxlDomainFreeMem() suffers
 the same bug since it is modeled after freemem().
 
 In the end, the best place to fix the bug on the Xen side was to
 slightly change the behavior of libxl_wait_for_memory_target().
 Instead of failing after caller-provided wait_sec, the function now
 blocks as long as dom0 memory ballooning is progressing.  It will return
 failure only when more memory is needed to reach the target and wait_sec
 have expired with no progress being made.  See xen.git commit fd3aa246.
 There was a dicussion on how this would affect other libxl apps like
 libvirt
 
 http://lists.xen.org/archives/html/xen-devel/2015-03/msg00739.html
 
 If libvirt containing this patch was build against a Xen containing
 the old libxl_wait_for_memory_target() behavior, libxlDomainFreeMem()
 will fail after 30 sec and domain creation will be terminated.
 Without this patch and with old libxl_wait_for_memory_target() behavior,
 libxlDomainFreeMem() does not succeed after 30 sec, but returns success
 anyway.  Domain creation continues resulting in all sorts of fun stuff
 like cpu soft lockups in the guest OS.  It was decided to properly fix
 libxl_wait_for_memory_target(), and if anything improve the default
 behavior of apps using the freemem reference impl in xl.
 
 xl was patched to accommodate the change in libxl_wait_for_memory_target()
 with xen.git commit 883b30a0.  This patch does the same in the libxl
 driver.  While at it, I changed the logic to essentially match
 freemem() in $xensrc/tools/libxl/xl_cmdimpl.c.  It was a bit cleaner
 IMO and will make it easier to spot future, potentially interesting
 divergences.
 
 Signed-off-by: Jim Fehlig jfeh...@suse.com
 ---
  src/libxl/libxl_domain.c | 57 
 
  1 file changed, 28 insertions(+), 29 deletions(-)
 
 diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
 index 407a9bd..ff78133 100644
 --- a/src/libxl/libxl_domain.c
 +++ b/src/libxl/libxl_domain.c
 @@ -1121,38 +1121,41 @@ libxlDomainFreeMem(libxlDomainObjPrivatePtr priv, 
 libxl_domain_config *d_config)
  {
  uint32_t needed_mem;
  uint32_t free_mem;
 -size_t i;
 -int ret = -1;
 +int ret;
  int tries = 3;
  int wait_secs = 10;
  
 -if ((ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
 -needed_mem)) = 0) {
 -for (i = 0; i  tries; ++i) {
 -if ((ret = libxl_get_free_memory(priv-ctx, free_mem))  0)
 -break;
 +ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
 +   needed_mem);
 +if (ret  0)
 +goto error;
  
 -if (free_mem = needed_mem) {
 -ret = 0;
 -break;
 -}
 +do {
 +ret = libxl_get_free_memory(priv-ctx, free_mem);
 +if (ret  0)
 +goto error;
  
 -if ((ret = libxl_set_memory_target(priv-ctx, 0,
 -   free_mem - needed_mem,
 -   /* relative */ 1, 0))  0)
 -break;
 +if (free_mem = needed_mem)
 +return 0;
  
 -ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
 - wait_secs);
 -if (ret == 0 || ret != ERROR_NOMEM)
 -break;
 +ret = libxl_set_memory_target(priv-ctx, 0,
 +  free_mem - needed_mem,
 +  /* relative */ 1, 0);
 +if (ret  0)
 +goto error;
  
 -if ((ret = libxl_wait_for_memory_target(priv-ctx, 0, 1))  0)
 -break;
 -}
 -}
 +ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
 + wait_secs);
 +if (ret  0)
 +goto error;
  
 -return ret;
 +tries--;
 +} while (tries  0);
 +
 + error:
 +virReportSystemError(ret, %s,
 + _(Failed to balloon domain0 memory));
 +return -1;
  }
  
  static void
 @@ -1271,12 +1274,8 @@ libxlDomainStart(libxlDriverPrivatePtr driver, 
 virDomainObjPtr vm,
 priv-ctx, d_config)  0)
  

[Xen-devel] [PATCH] libxl: fix dom0 balloon logic

2015-03-20 Thread Jim Fehlig
Recent testing on large memory systems revealed a bug in the Xen xl
tool's freemem() function.  When autoballooning is enabled, freemem()
is used to ensure enough memory is available to start a domain,
ballooning dom0 if necessary.  When ballooning large amounts of memory
from dom0, freemem() would exceed its self-imposed wait time and
return an error.  Meanwhile, dom0 continued to balloon.  Starting the
domain later, after sufficient memory was ballooned from dom0, would
succeed.  The libvirt implementation in libxlDomainFreeMem() suffers
the same bug since it is modeled after freemem().

In the end, the best place to fix the bug on the Xen side was to
slightly change the behavior of libxl_wait_for_memory_target().
Instead of failing after caller-provided wait_sec, the function now
blocks as long as dom0 memory ballooning is progressing.  It will return
failure only when more memory is needed to reach the target and wait_sec
have expired with no progress being made.  See xen.git commit fd3aa246.
There was a dicussion on how this would affect other libxl apps like
libvirt

http://lists.xen.org/archives/html/xen-devel/2015-03/msg00739.html

If libvirt containing this patch was build against a Xen containing
the old libxl_wait_for_memory_target() behavior, libxlDomainFreeMem()
will fail after 30 sec and domain creation will be terminated.
Without this patch and with old libxl_wait_for_memory_target() behavior,
libxlDomainFreeMem() does not succeed after 30 sec, but returns success
anyway.  Domain creation continues resulting in all sorts of fun stuff
like cpu soft lockups in the guest OS.  It was decided to properly fix
libxl_wait_for_memory_target(), and if anything improve the default
behavior of apps using the freemem reference impl in xl.

xl was patched to accommodate the change in libxl_wait_for_memory_target()
with xen.git commit 883b30a0.  This patch does the same in the libxl
driver.  While at it, I changed the logic to essentially match
freemem() in $xensrc/tools/libxl/xl_cmdimpl.c.  It was a bit cleaner
IMO and will make it easier to spot future, potentially interesting
divergences.

Signed-off-by: Jim Fehlig jfeh...@suse.com
---
 src/libxl/libxl_domain.c | 57 
 1 file changed, 28 insertions(+), 29 deletions(-)

diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
index 407a9bd..ff78133 100644
--- a/src/libxl/libxl_domain.c
+++ b/src/libxl/libxl_domain.c
@@ -1121,38 +1121,41 @@ libxlDomainFreeMem(libxlDomainObjPrivatePtr priv, 
libxl_domain_config *d_config)
 {
 uint32_t needed_mem;
 uint32_t free_mem;
-size_t i;
-int ret = -1;
+int ret;
 int tries = 3;
 int wait_secs = 10;
 
-if ((ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
-needed_mem)) = 0) {
-for (i = 0; i  tries; ++i) {
-if ((ret = libxl_get_free_memory(priv-ctx, free_mem))  0)
-break;
+ret = libxl_domain_need_memory(priv-ctx, d_config-b_info,
+   needed_mem);
+if (ret  0)
+goto error;
 
-if (free_mem = needed_mem) {
-ret = 0;
-break;
-}
+do {
+ret = libxl_get_free_memory(priv-ctx, free_mem);
+if (ret  0)
+goto error;
 
-if ((ret = libxl_set_memory_target(priv-ctx, 0,
-   free_mem - needed_mem,
-   /* relative */ 1, 0))  0)
-break;
+if (free_mem = needed_mem)
+return 0;
 
-ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
- wait_secs);
-if (ret == 0 || ret != ERROR_NOMEM)
-break;
+ret = libxl_set_memory_target(priv-ctx, 0,
+  free_mem - needed_mem,
+  /* relative */ 1, 0);
+if (ret  0)
+goto error;
 
-if ((ret = libxl_wait_for_memory_target(priv-ctx, 0, 1))  0)
-break;
-}
-}
+ret = libxl_wait_for_free_memory(priv-ctx, 0, needed_mem,
+ wait_secs);
+if (ret  0)
+goto error;
 
-return ret;
+tries--;
+} while (tries  0);
+
+ error:
+virReportSystemError(ret, %s,
+ _(Failed to balloon domain0 memory));
+return -1;
 }
 
 static void
@@ -1271,12 +1274,8 @@ libxlDomainStart(libxlDriverPrivatePtr driver, 
virDomainObjPtr vm,
priv-ctx, d_config)  0)
 goto endjob;
 
-if (cfg-autoballoon  libxlDomainFreeMem(priv, d_config)  0) {
-virReportError(VIR_ERR_INTERNAL_ERROR,
-   _(libxenlight failed to get free memory for domain 
'%s'),
-   d_config.c_info.name);
+if