Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing

2017-03-20 Thread David Gibson
On Wed, Mar 15, 2017 at 12:44:18PM +0530, Bharata B Rao wrote:
> On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote:
> 
> > This patch implements hypercalls allowing a PAPR guest to resize its own
> > hash page table.  This will eventually allow for more flexible memory
> > hotplug.
> >
> > The implementation is partially asynchronous, handled in a special thread
> > running the hpt_prepare_thread() function.  The state of a pending resize
> > is stored in SPAPR_MACHINE->pending_hpt.
> >
> > The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
> > if one is already in progress, monitor it for completion.  If there is an
> > existing HPT resize in progress that doesn't match the size specified in
> > the call, it will cancel it, replacing it with a new one matching the
> > given size.
> >
> > The H_RESIZE_HPT_COMMIT hypercall completes the transition to a resized
> > HPT, and can only be called successfully once H_RESIZE_HPT_PREPARE has
> > successfully completed initialization of a new HPT.  The guest must
> > ensure that there are no concurrent accesses to the existing HPT while
> > this is called (this effectively means stop_machine() for Linux guests).
> >
> > For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
> > HPTE into the new HPT.  This can have quite high latency, but it seems to
> > be of the order of typical migration downtime latencies for HPTs of size
> > up to ~2GiB (which would be used in a 256GiB guest).
> >
> > In future we probably want to move more of the rehashing to the "prepare"
> > phase, by having H_ENTER and other hcalls update both current and
> > pending HPTs.  That's a project for another day, but should be possible
> > without any changes to the guest interface.
> >
> > Signed-off-by: David Gibson 
> > ---
> >  hw/ppc/spapr.c          |   4 +-
> >  hw/ppc/spapr_hcall.c    | 306 +++-
> >  include/hw/ppc/spapr.h  |   6 +
> >  target/ppc/mmu-hash64.h |   4 +
> >  4 files changed, 314 insertions(+), 6 deletions(-)
> >
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 558109c..83db110 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -94,8 +94,6 @@
> >
> >  #define PHANDLE_XICP0x
> >
> > -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift))
> > -
> >  static int try_create_xics(sPAPRMachineState *spapr, const char
> > *type_ics,
> > const char *type_icp, int nr_servers,
> > int nr_irqs, Error **errp)
> > @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor
> > *vhyp, hwaddr ptex,
> >  }
> >  }
> >
> > -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> >  {
> >  int shift;
> >
> > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> > index 9f88960..cdafc3f 100644
> > --- a/hw/ppc/spapr_hcall.c
> > +++ b/hw/ppc/spapr_hcall.c
> > @@ -3,6 +3,7 @@
> >  #include "sysemu/hw_accel.h"
> >  #include "sysemu/sysemu.h"
> >  #include "qemu/log.h"
> > +#include "qemu/error-report.h"
> >  #include "cpu.h"
> >  #include "exec/exec-all.h"
> >  #include "helper_regs.h"
> > @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu,
> > sPAPRMachineState *spapr,
> >  return H_SUCCESS;
> >  }
> >
> > +struct sPAPRPendingHPT {
> > +/* These fields are read-only after initialization */
> > +int shift;
> > +QemuThread thread;
> > +
> > +/* These fields are protected by the BQL */
> > +bool complete;
> > +
> > +/* These fields are private to the preparation thread if
> > + * !complete, otherwise protected by the BQL */
> > +int ret;
> > +void *hpt;
> > +};
> > +
> > +static void free_pending_hpt(sPAPRPendingHPT *pending)
> > +{
> > +if (pending->hpt) {
> > +qemu_vfree(pending->hpt);
> > +}
> > +
> > +g_free(pending);
> > +}
> > +
> > +static void *hpt_prepare_thread(void *opaque)
> > +{
> > +sPAPRPendingHPT *pending = opaque;
> > +size_t size = 1ULL << pending->shift;
> > +
> > +pending->hpt = qemu_memalign(size, size);
> > +if (pending->hpt) {
> > +memset(pending->hpt, 0, size);
> > +pending->ret = H_SUCCESS;
> > +} else {
> > +pending->ret = H_NO_MEM;
> > +}
> > +
> > +qemu_mutex_lock_iothread();
> > +
> > +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
> > +/* Ready to go */
> > +pending->complete = true;
> > +} else {
> > +/* We've been cancelled, clean ourselves up */
> > +free_pending_hpt(pending);
> > +}
> > +
> > +qemu_mutex_unlock_iothread();
> > +return NULL;
> > +}
> > +
> > +/* Must be called with BQL held */
> > +static void cancel_hpt_prepare(sPAPRMachineState *spapr)
> > +{
> > +sPAPRPendingHPT *pending = spapr->pending_hpt;
> > +
> > +/* Let the 

Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing

2017-03-15 Thread Bharata B Rao
On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote:

> This patch implements hypercalls allowing a PAPR guest to resize its own
> hash page table.  This will eventually allow for more flexible memory
> hotplug.
>
> The implementation is partially asynchronous, handled in a special thread
> running the hpt_prepare_thread() function.  The state of a pending resize
> is stored in SPAPR_MACHINE->pending_hpt.
>
> The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
> if one is already in progress, monitor it for completion.  If there is an
> existing HPT resize in progress that doesn't match the size specified in
> the call, it will cancel it, replacing it with a new one matching the
> given size.
>
> The H_RESIZE_HPT_COMMIT hypercall completes the transition to a resized
> HPT, and can only be called successfully once H_RESIZE_HPT_PREPARE has
> successfully completed initialization of a new HPT.  The guest must ensure
> that there are no concurrent accesses to the existing HPT while this is
> called (this effectively means stop_machine() for Linux guests).
>
> For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
> HPTE into the new HPT.  This can have quite high latency, but it seems to
> be of the order of typical migration downtime latencies for HPTs of size
> up to ~2GiB (which would be used in a 256GiB guest).
>
> In future we probably want to move more of the rehashing to the "prepare"
> phase, by having H_ENTER and other hcalls update both current and
> pending HPTs.  That's a project for another day, but should be possible
> without any changes to the guest interface.
>
> Signed-off-by: David Gibson 
> ---
>  hw/ppc/spapr.c          |   4 +-
>  hw/ppc/spapr_hcall.c    | 306 +++-
>  include/hw/ppc/spapr.h  |   6 +
>  target/ppc/mmu-hash64.h |   4 +
>  4 files changed, 314 insertions(+), 6 deletions(-)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 558109c..83db110 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -94,8 +94,6 @@
>
>  #define PHANDLE_XICP0x
>
> -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift))
> -
>  static int try_create_xics(sPAPRMachineState *spapr, const char
> *type_ics,
> const char *type_icp, int nr_servers,
> int nr_irqs, Error **errp)
> @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor
> *vhyp, hwaddr ptex,
>  }
>  }
>
> -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
> +int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
>  {
>  int shift;
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 9f88960..cdafc3f 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -3,6 +3,7 @@
>  #include "sysemu/hw_accel.h"
>  #include "sysemu/sysemu.h"
>  #include "qemu/log.h"
> +#include "qemu/error-report.h"
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "helper_regs.h"
> @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu,
> sPAPRMachineState *spapr,
>  return H_SUCCESS;
>  }
>
> +struct sPAPRPendingHPT {
> +/* These fields are read-only after initialization */
> +int shift;
> +QemuThread thread;
> +
> +/* These fields are protected by the BQL */
> +bool complete;
> +
> +/* These fields are private to the preparation thread if
> + * !complete, otherwise protected by the BQL */
> +int ret;
> +void *hpt;
> +};
> +
> +static void free_pending_hpt(sPAPRPendingHPT *pending)
> +{
> +if (pending->hpt) {
> +qemu_vfree(pending->hpt);
> +}
> +
> +g_free(pending);
> +}
> +
> +static void *hpt_prepare_thread(void *opaque)
> +{
> +sPAPRPendingHPT *pending = opaque;
> +size_t size = 1ULL << pending->shift;
> +
> +pending->hpt = qemu_memalign(size, size);
> +if (pending->hpt) {
> +memset(pending->hpt, 0, size);
> +pending->ret = H_SUCCESS;
> +} else {
> +pending->ret = H_NO_MEM;
> +}
> +
> +qemu_mutex_lock_iothread();
> +
> +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
> +/* Ready to go */
> +pending->complete = true;
> +} else {
> +/* We've been cancelled, clean ourselves up */
> +free_pending_hpt(pending);
> +}
> +
> +qemu_mutex_unlock_iothread();
> +return NULL;
> +}
> +
> +/* Must be called with BQL held */
> +static void cancel_hpt_prepare(sPAPRMachineState *spapr)
> +{
> +sPAPRPendingHPT *pending = spapr->pending_hpt;
> +
> +/* Let the thread know it's cancelled */
> +spapr->pending_hpt = NULL;
> +
> +if (!pending) {
> +/* Nothing to do */
> +return;
> +}
> +
> +if (!pending->complete) {
> +/* thread will clean itself up */
> +return;
> +}
> +
> +free_pending_hpt(pending);
> +}
> +
>  static target_ulong 

[Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing

2017-03-13 Thread David Gibson
This patch implements hypercalls allowing a PAPR guest to resize its own
hash page table.  This will eventually allow for more flexible memory
hotplug.

The implementation is partially asynchronous, handled in a special thread
running the hpt_prepare_thread() function.  The state of a pending resize
is stored in SPAPR_MACHINE->pending_hpt.

The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or,
if one is already in progress, monitor it for completion.  If there is an
existing HPT resize in progress that doesn't match the size specified in
the call, it will cancel it, replacing it with a new one matching the
given size.

The H_RESIZE_HPT_COMMIT hypercall completes the transition to a resized HPT,
and can only be called successfully once H_RESIZE_HPT_PREPARE has
successfully completed initialization of a new HPT.  The guest must ensure
that there are no concurrent accesses to the existing HPT while this is
called (this effectively means stop_machine() for Linux guests).

For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each
HPTE into the new HPT.  This can have quite high latency, but it seems to
be of the order of typical migration downtime latencies for HPTs of size
up to ~2GiB (which would be used in a 256GiB guest).

In future we probably want to move more of the rehashing to the "prepare"
phase, by having H_ENTER and other hcalls update both current and
pending HPTs.  That's a project for another day, but should be possible
without any changes to the guest interface.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c          |   4 +-
 hw/ppc/spapr_hcall.c    | 306 +++-
 include/hw/ppc/spapr.h  |   6 +
 target/ppc/mmu-hash64.h |   4 +
 4 files changed, 314 insertions(+), 6 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 558109c..83db110 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -94,8 +94,6 @@
 
 #define PHANDLE_XICP0x
 
-#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift))
-
 static int try_create_xics(sPAPRMachineState *spapr, const char *type_ics,
const char *type_icp, int nr_servers,
int nr_irqs, Error **errp)
@@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor *vhyp, 
hwaddr ptex,
 }
 }
 
-static int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
+int spapr_hpt_shift_for_ramsize(uint64_t ramsize)
 {
 int shift;
 
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 9f88960..cdafc3f 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -3,6 +3,7 @@
 #include "sysemu/hw_accel.h"
 #include "sysemu/sysemu.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "helper_regs.h"
@@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu, 
sPAPRMachineState *spapr,
 return H_SUCCESS;
 }
 
+struct sPAPRPendingHPT {
+/* These fields are read-only after initialization */
+int shift;
+QemuThread thread;
+
+/* These fields are protected by the BQL */
+bool complete;
+
+/* These fields are private to the preparation thread if
+ * !complete, otherwise protected by the BQL */
+int ret;
+void *hpt;
+};
+
+static void free_pending_hpt(sPAPRPendingHPT *pending)
+{
+if (pending->hpt) {
+qemu_vfree(pending->hpt);
+}
+
+g_free(pending);
+}
+
+static void *hpt_prepare_thread(void *opaque)
+{
+sPAPRPendingHPT *pending = opaque;
+size_t size = 1ULL << pending->shift;
+
+pending->hpt = qemu_memalign(size, size);
+if (pending->hpt) {
+memset(pending->hpt, 0, size);
+pending->ret = H_SUCCESS;
+} else {
+pending->ret = H_NO_MEM;
+}
+
+qemu_mutex_lock_iothread();
+
+if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) {
+/* Ready to go */
+pending->complete = true;
+} else {
+/* We've been cancelled, clean ourselves up */
+free_pending_hpt(pending);
+}
+
+qemu_mutex_unlock_iothread();
+return NULL;
+}
+
+/* Must be called with BQL held */
+static void cancel_hpt_prepare(sPAPRMachineState *spapr)
+{
+sPAPRPendingHPT *pending = spapr->pending_hpt;
+
+/* Let the thread know it's cancelled */
+spapr->pending_hpt = NULL;
+
+if (!pending) {
+/* Nothing to do */
+return;
+}
+
+if (!pending->complete) {
+/* thread will clean itself up */
+return;
+}
+
+free_pending_hpt(pending);
+}
+
 static target_ulong h_resize_hpt_prepare(PowerPCCPU *cpu,
  sPAPRMachineState *spapr,
  target_ulong opcode,
  target_ulong *args)
 {
 target_ulong flags = args[0];
-target_ulong shift = args[1];
+int shift = args[1];
+sPAPRPendingHPT *pending = spapr->pending_hpt;
+