Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing

2017-03-20 Thread David Gibson
On Wed, Mar 15, 2017 at 12:44:18PM +0530, Bharata B Rao wrote:
> On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote:
> 
> > This patch implements hypercalls allowing a PAPR guest to resize its own
> > hash page table.  This will eventually allow for more flexible memory
> > hotplug.
> >
> > The implementation is partially asynchronous, handled in a special thread
> > running the hpt_prepare_thread() function.  The state of a pending resize
> > is stored in SPAPR_MACHINE->pending_hpt.
> >
> > The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
> > if one is already in progress, monitor it for completion.  If there is an
> > existing HPT resize in progress that doesn't match the size specified in
> > the call, it will cancel it, replacing it with a new one matching the
> > given size.
> >
> > The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only
> > be called successfully once H_RESIZE_HPT_PREPARE has successfully
> > completed initialization of a new HPT.  The guest must ensure that there
> > are no concurrent accesses to the existing HPT while this is called (this
> > effectively means stop_machine() for Linux guests).
> >
> > For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
> > HPTE into the new HPT.  This can have quite high latency, but it seems to
> > be of the order of typical migration downtime latencies for HPTs of size
> > up to ~2GiB (which would be used in a 256GiB guest).
> >
> > In future we probably want to move more of the rehashing to the "prepare"
> > phase, by having H_ENTER and other hcalls update both current and
> > pending HPTs.  That's a project for another day, but should be possible
> > without any changes to the guest interface.
> >
> > Signed-off-by: David Gibson 
> > ---
> >  hw/ppc/spapr.c  |   4 +-
> >  hw/ppc/spapr_hcall.c    | 306 +++++++++++++++++++++++++++++++++++++++++++++++-
> >  include/hw/ppc/spapr.h  |   6 +
> >  target/ppc/mmu-hash64.h |   4 +
> >  4 files changed, 314 insertions(+), 6 deletions(-)
> >
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 558109c..83db110 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -94,8 +94,6 @@
> >
> >  #define PHANDLE_XICP            0x00001111
> >
> > -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift))
> > -
> >  static int try_create_xics(sPAPRMachineState *spapr, const char *type_ics,
> > const char *type_icp, int nr_servers,
> > int nr_irqs, Error **errp)
> > @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, hwaddr ptex,
> >  }
> >  }
> >
> > -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> >  {
> >  int shift;
> >
> > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> > index 9f88960..cdafc3f 100644
> > --- a/hw/ppc/spapr_hcall.c
> > +++ b/hw/ppc/spapr_hcall.c
> > @@ -3,6 +3,7 @@
> >  #include "sysemu/hw_accel.h"
> >  #include "sysemu/sysemu.h"
> >  #include "qemu/log.h"
> > +#include "qemu/error-report.h"
> >  #include "cpu.h"
> >  #include "exec/exec-all.h"
> >  #include "helper_regs.h"
> > @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >  return H_SUCCESS;
> >  }
> >
> > +struct sPAPRPendingHPT {
> > +/* These fields are read-only after initialization */
> > +int shift;
> > +QemuThread thread;
> > +
> > +/* These fields are protected by the BQL */
> > +bool complete;
> > +
> > +/* These fields are private to the preparation thread if
> > + * !complete, otherwise protected by the BQL */
> > +int ret;
> > +void *hpt;
> > +};
> > +
> > +static void free_pending_hpt(sPAPRPendingHPT *pending)
> > +{
> > +if (pending->hpt) {
> > +qemu_vfree(pending->hpt);
> > +}
> > +
> > +g_free(pending);
> > +}
> > +
> > +static void *hpt_prepare_thread(void *opaque)
> > +{
> > +sPAPRPendingHPT *pending = opaque;
> > +size_t size = 1ULL << pending->shift;
> > +
> > +pending->hpt = qemu_memalign(size, size);
> > +if (pending->hpt) {
> > +memset(pending->hpt, 0, size);
> > +pending->ret = H_SUCCESS;
> > +} else {
> > +pending->ret = H_NO_MEM;
> > +}
> > +
> > +qemu_mutex_lock_iothread();
> > +
> > +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
> > +/* Ready to go */
> > +pending->complete = true;
> > +} else {
> > +/* We've been cancelled, clean ourselves up */
> > +free_pending_hpt(pending);
> > +}
> > +
> > +qemu_mutex_unlock_iothread();
> > +return NULL;
> > +}
> > +
> > +/* Must be called with BQL held */
> > +static void cancel_hpt_prepare(sPAPRMachineState *spapr)
> > +{
> > +sPAPRPendingHPT *pending = spapr->pending_hpt;
> > +
> > +/* Let the 

Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing

2017-03-15 Thread Bharata B Rao
On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote:

> This patch implements hypercalls allowing a PAPR guest to resize its own
> hash page table.  This will eventually allow for more flexible memory
> hotplug.
>
> The implementation is partially asynchronous, handled in a special thread
> running the hpt_prepare_thread() function.  The state of a pending resize
> is stored in SPAPR_MACHINE->pending_hpt.
>
> The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
> if one is already in progress, monitor it for completion.  If there is an
> existing HPT resize in progress that doesn't match the size specified in
> the call, it will cancel it, replacing it with a new one matching the
> given size.
>
> The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only
> be called successfully once H_RESIZE_HPT_PREPARE has successfully
> completed initialization of a new HPT.  The guest must ensure that there
> are no concurrent accesses to the existing HPT while this is called (this
> effectively means stop_machine() for Linux guests).
>
> For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
> HPTE into the new HPT.  This can have quite high latency, but it seems to
> be of the order of typical migration downtime latencies for HPTs of size
> up to ~2GiB (which would be used in a 256GiB guest).
>
> In future we probably want to move more of the rehashing to the "prepare"
> phase, by having H_ENTER and other hcalls update both current and
> pending HPTs.  That's a project for another day, but should be possible
> without any changes to the guest interface.
>
> Signed-off-by: David Gibson 
> ---
>  hw/ppc/spapr.c  |   4 +-
>  hw/ppc/spapr_hcall.c    | 306 +++++++++++++++++++++++++++++++++++++++++++++++-
>  include/hw/ppc/spapr.h  |   6 +
>  target/ppc/mmu-hash64.h |   4 +
>  4 files changed, 314 insertions(+), 6 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 558109c..83db110 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -94,8 +94,6 @@
>
>  #define PHANDLE_XICP            0x00001111
>
> -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift))
> -
>  static int try_create_xics(sPAPRMachineState *spapr, const char *type_ics,
> const char *type_icp, int nr_servers,
> int nr_irqs, Error **errp)
> @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, hwaddr ptex,
>  }
>  }
>
> -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> +int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
>  {
>  int shift;
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 9f88960..cdafc3f 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -3,6 +3,7 @@
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/sysemu.h"
>  #include "qemu/log.h"
> +#include "qemu/error-report.h"
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
> @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>  return H_SUCCESS;
>  }
>
> +struct sPAPRPendingHPT {
> +/* These fields are read-only after initialization */
> +int shift;
> +QemuThread thread;
> +
> +/* These fields are protected by the BQL */
> +bool complete;
> +
> +/* These fields are private to the preparation thread if
> + * !complete, otherwise protected by the BQL */
> +int ret;
> +void *hpt;
> +};
> +
> +static void free_pending_hpt(sPAPRPendingHPT *pending)
> +{
> +if (pending->hpt) {
> +qemu_vfree(pending->hpt);
> +}
> +
> +g_free(pending);
> +}
> +
> +static void *hpt_prepare_thread(void *opaque)
> +{
> +sPAPRPendingHPT *pending = opaque;
> +size_t size = 1ULL << pending->shift;
> +
> +pending->hpt = qemu_memalign(size, size);
> +if (pending->hpt) {
> +memset(pending->hpt, 0, size);
> +pending->ret = H_SUCCESS;
> +} else {
> +pending->ret = H_NO_MEM;
> +}
> +
> +qemu_mutex_lock_iothread();
> +
> +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
> +/* Ready to go */
> +pending->complete = true;
> +} else {
> +/* We've been cancelled, clean ourselves up */
> +free_pending_hpt(pending);
> +}
> +
> +qemu_mutex_unlock_iothread();
> +return NULL;
> +}
> +
> +/* Must be called with BQL held */
> +static void cancel_hpt_prepare(sPAPRMachineState *spapr)
> +{
> +sPAPRPendingHPT *pending = spapr->pending_hpt;
> +
> +/* Let the thread know it's cancelled */
> +spapr->pending_hpt = NULL;
> +
> +if (!pending) {
> +/* Nothing to do */
> +return;
> +}
> +
> +if (!pending->complete) {
> +/* thread will clean itself up */
> +return;
> +}
> +
> +free_pending_hpt(pending);
> +}
> +
>  static target_ulong