ping
On Fri, Nov 11, 2022 at 02:54:38PM +0530, Shaleen Bathla wrote:
> Problem:
> libvirt has a 5 second timeout (generally) for hotplug/unplug
> operations which can time out due to heavy load in guest.
> 
> vcpu hotunplug occurs one vcpu at a time.
> But, if we perform hotplug-unplug repeatedly,
> Case 1: qemu sends multiple timedout vcpu unplug notification before
> libvirt processed first one.
> Case 2: when attempting a hotplug, qemu finishes unplug of another cpu
> 
> libvirt waits for an async event notification from qemu regarding
> successful vcpu delete.
> qemu deletes its vcpu, vcpuinfo and sends notification to libvirt.
> libvirt handles vcpu delete notification, and updates vcpuinfo
> received from qemu in qemuDomainRefreshVcpuInfo().
> 
> qemu's vcpuinfo during refresh will not contain other deleted, sent vcpu
> qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending
> timedout vcpu(s) which qemu has deleted and notified libvirt.
> The overwrite resets other timedout vcpu's alias to NULL and tid to 0.
> 
> The error is then seen when validating tid of vcpus.
> Example error log:
> "internal error: qemu didn't report thread id for vcpu 'XX'"
> 
> Solution:
> Clear vcpu alias of only the vcpu that hotplug API calls.
> Other vcpu-alias won't get reset when vcpuinfo refresh occurs.
> Validation is also done for corresponding vcpus only, not all.
> 
> Co-authored-by: Partha Satapathy <partha.satapa...@oracle.com>
> Signed-off-by: Shaleen Bathla <shaleen.bat...@oracle.com>
> ---
>  src/qemu/qemu_domain.c  |  6 ++++--
>  src/qemu/qemu_hotplug.c | 39 +++++++++++++++++++++++++++++++--------
>  2 files changed, 35 insertions(+), 10 deletions(-)
> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 3435da5bdc4c..6ae842d0e32f 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -9773,8 +9773,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
>          vcpupriv->vcpus = info[i].vcpus;
>          VIR_FREE(vcpupriv->type);
>          vcpupriv->type = g_steal_pointer(&info[i].type);
> -        VIR_FREE(vcpupriv->alias);
> -        vcpupriv->alias = g_steal_pointer(&info[i].alias);
> +        if (info[i].alias) {
> +            VIR_FREE(vcpupriv->alias);
> +            vcpupriv->alias = g_steal_pointer(&info[i].alias);
> +        }
>          virJSONValueFree(vcpupriv->props);
>          vcpupriv->props = g_steal_pointer(&info[i].props);
>          vcpupriv->enable_id = info[i].id;
> diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
> index da92ced2f444..f11b90a220ac 100644
> --- a/src/qemu/qemu_hotplug.c
> +++ b/src/qemu/qemu_hotplug.c
> @@ -6090,6 +6090,8 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
>      unsigned int nvcpus = vcpupriv->vcpus;
>      virErrorPtr save_error = NULL;
>      size_t i;
> +    bool hasVcpuPids = qemuDomainHasVcpuPids(vm);
> +    bool rollback = false;
> 
>      if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
>          return -1;
> @@ -6097,14 +6099,26 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
>      /* validation requires us to set the expected state prior to calling it 
> */
>      for (i = vcpu; i < vcpu + nvcpus; i++) {
>          vcpuinfo = virDomainDefGetVcpu(vm->def, i);
> +        vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
> +
>          vcpuinfo->online = false;
> +        VIR_FREE(vcpupriv->alias); /* clear vcpu alias of only this vcpu */
> +
> +        if (vcpupriv->tid != 0) {
> +            if (hasVcpuPids)
> +                virReportError(VIR_ERR_INTERNAL_ERROR,
> +                               _("qemu reported thread id for inactive vcpu 
> '%zu'"), i);
> +
> +            rollback = true;
> +            break;
> +        }
>      }
> 
> -    if (qemuDomainValidateVcpuInfo(vm) < 0) {
> +    if (rollback) {
>          /* rollback vcpu count if the setting has failed */
>          virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false);
> 
> -        for (i = vcpu; i < vcpu + nvcpus; i++) {
> +        for (; i >= vcpu; i--) {
>              vcpuinfo = virDomainDefGetVcpu(vm->def, i);
>              vcpuinfo->online = true;
>          }
> @@ -6141,6 +6155,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
>              return;
>          }
>      }
> +
> +    VIR_DEBUG("%s not found in vcpulist of domain %s ",
> +              alias, vm->def->name);
>  }
> 
> 
> @@ -6209,6 +6226,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
>      int rc;
>      int oldvcpus = virDomainDefGetVcpus(vm->def);
>      size_t i;
> +    bool hasVcpuPids = qemuDomainHasVcpuPids(vm);
> 
>      if (!qemuDomainSupportsNewVcpuHotplug(vm)) {
>          virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
> @@ -6245,14 +6263,19 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
> 
>          vcpuinfo->online = true;
> 
> -        if (vcpupriv->tid > 0 &&
> -            qemuProcessSetupVcpu(vm, i, true) < 0)
> -            return -1;
> +        if (vcpupriv->tid > 0) {
> +            if (qemuProcessSetupVcpu(vm, i, true) < 0) {
> +                return -1;
> +            }
> +        } else {
> +            if (hasVcpuPids) {
> +                virReportError(VIR_ERR_INTERNAL_ERROR,
> +                               _("qemu didn't report thread id for vcpu 
> '%zu'"), i);
> +                return -1;
> +            }
> +        }
>      }
> 
> -    if (qemuDomainValidateVcpuInfo(vm) < 0)
> -        return -1;
> -
>      qemuDomainVcpuPersistOrder(vm->def);
> 
>      if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0)
> --
> 2.31.1
> 
> V1 Patch :
> https://listman.redhat.com/archives/libvir-list/2022-November/235470.html
> 
> V1 Patch Review comments :
> https://listman.redhat.com/archives/libvir-list/2022-November/235534.html
> 
> Changes in V2 since V1:
> - Patch addresses review comments from Peter Krempa.
> - Validation is done for per cpu hotplug entity instead of all

Reply via email to