Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing
On Wed, Mar 15, 2017 at 12:44:18PM +0530, Bharata B Rao wrote: > On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote: > > > This patch implements hypercalls allowing a PAPR guest to resize its own > > hash page table. This will eventually allow for more flexible memory > > hotplug. > > > > The implementation is partially asynchronous, handled in a special thread > > running the hpt_prepare_thread() function. The state of a pending resize > > is stored in SPAPR_MACHINE->pending_hpt. > > > > The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or, > > if one is already in progress, monitor it for completion. If there is an > > existing HPT resize in progress that doesn't match the size specified in > > the call, it will cancel it, replacing it with a new one matching the > > given size. > > > > The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only > > be called successfully once H_RESIZE_HPT_PREPARE has successfully > > completed initialization of a new HPT. The guest must ensure that there > > are no concurrent accesses to the existing HPT while this is called (this > > effectively means stop_machine() for Linux guests). > > > > For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each > > HPTE into the new HPT. This can have quite high latency, but it seems to > > be of the order of typical migration downtime latencies for HPTs of size > > up to ~2GiB (which would be used in a 256GiB guest). > > > > In future we probably want to move more of the rehashing to the "prepare" > > phase, by having H_ENTER and other hcalls update both current and > > pending HPTs. That's a project for another day, but should be possible > > without any changes to the guest interface.
> > > > Signed-off-by: David Gibson > > --- > > hw/ppc/spapr.c | 4 +- > > hw/ppc/spapr_hcall.c| 306 ++ > > +- > > include/hw/ppc/spapr.h | 6 + > > target/ppc/mmu-hash64.h | 4 + > > 4 files changed, 314 insertions(+), 6 deletions(-) > > > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > > index 558109c..83db110 100644 > > --- a/hw/ppc/spapr.c > > +++ b/hw/ppc/spapr.c > > @@ -94,8 +94,6 @@ > > > > #define PHANDLE_XICP0x > > > > -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift)) > > - > > static int try_create_xics(sPAPRMachineState *spapr, const char > > *type_ics, > > const char *type_icp, int nr_servers, > > int nr_irqs, Error **errp) > > @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor > > *vhyp, hwaddr ptex, > > } > > } > > > > -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize) > > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize) > > { > > int shift; > > > > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > > index 9f88960..cdafc3f 100644 > > --- a/hw/ppc/spapr_hcall.c > > +++ b/hw/ppc/spapr_hcall.c > > @@ -3,6 +3,7 @@ > > #include "sysemu/hw_accel.h" > > #include "sysemu/sysemu.h" > > #include "qemu/log.h" > > +#include "qemu/error-report.h" > > #include "cpu.h" > > #include "exec/exec-all.h" > > #include "helper_regs.h" > > @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu, > > sPAPRMachineState *spapr, > > return H_SUCCESS; > > } > > > > +struct sPAPRPendingHPT { > > +/* These fields are read-only after initialization */ > > +int shift; > > +QemuThread thread; > > + > > +/* These fields are protected by the BQL */ > > +bool complete; > > + > > +/* These fields are private to the preparation thread if > > + * !complete, otherwise protected by the BQL */ > > +int ret; > > +void *hpt; > > +}; > > + > > +static void free_pending_hpt(sPAPRPendingHPT *pending) > > +{ > > +if (pending->hpt) { > > +qemu_vfree(pending->hpt); > > +} > > + > > +g_free(pending); > > +} > > + > > +static void 
*hpt_prepare_thread(void *opaque) > > +{ > > +sPAPRPendingHPT *pending = opaque; > > +size_t size = 1ULL << pending->shift; > > + > > +pending->hpt = qemu_memalign(size, size); > > +if (pending->hpt) { > > +memset(pending->hpt, 0, size); > > +pending->ret = H_SUCCESS; > > +} else { > > +pending->ret = H_NO_MEM; > > +} > > + > > +qemu_mutex_lock_iothread(); > > + > > +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) { > > +/* Ready to go */ > > +pending->complete = true; > > +} else { > > +/* We've been cancelled, clean ourselves up */ > > +free_pending_hpt(pending); > > +} > > + > > +qemu_mutex_unlock_iothread(); > > +return NULL; > > +} > > + > > +/* Must be called with BQL held */ > > +static void cancel_hpt_prepare(sPAPRMachineState *spapr) > > +{ > > +sPAPRPendingHPT *pending = spapr->pending_hpt; > > + > > +/* Let the
Re: [Qemu-devel] [PATCHv2 for-2.10 2/5] pseries: Implement HPT resizing
On Tue, Mar 14, 2017 at 11:04 AM, David Gibson wrote: > This patch implements hypercalls allowing a PAPR guest to resize its own > hash page table. This will eventually allow for more flexible memory > hotplug. > > The implementation is partially asynchronous, handled in a special thread > running the hpt_prepare_thread() function. The state of a pending resize > is stored in SPAPR_MACHINE->pending_hpt. > > The H_RESIZE_HPT_PREPARE hypercall will kick off creation of a new HPT, or, > if one is already in progress, monitor it for completion. If there is an > existing HPT resize in progress that doesn't match the size specified in > the call, it will cancel it, replacing it with a new one matching the > given size. > > The H_RESIZE_HPT_COMMIT completes transition to a resized HPT, and can only > be called successfully once H_RESIZE_HPT_PREPARE has successfully > completed initialization of a new HPT. The guest must ensure that there > are no concurrent accesses to the existing HPT while this is called (this > effectively means stop_machine() for Linux guests). > > For now H_RESIZE_HPT_COMMIT goes through the whole old HPT, rehashing each > HPTE into the new HPT. This can have quite high latency, but it seems to > be of the order of typical migration downtime latencies for HPTs of size > up to ~2GiB (which would be used in a 256GiB guest). > > In future we probably want to move more of the rehashing to the "prepare" > phase, by having H_ENTER and other hcalls update both current and > pending HPTs. That's a project for another day, but should be possible > without any changes to the guest interface.
> > Signed-off-by: David Gibson > --- > hw/ppc/spapr.c | 4 +- > hw/ppc/spapr_hcall.c| 306 ++ > +- > include/hw/ppc/spapr.h | 6 + > target/ppc/mmu-hash64.h | 4 + > 4 files changed, 314 insertions(+), 6 deletions(-) > > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index 558109c..83db110 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -94,8 +94,6 @@ > > #define PHANDLE_XICP0x > > -#define HTAB_SIZE(spapr)(1ULL << ((spapr)->htab_shift)) > - > static int try_create_xics(sPAPRMachineState *spapr, const char > *type_ics, > const char *type_icp, int nr_servers, > int nr_irqs, Error **errp) > @@ -1169,7 +1167,7 @@ static void spapr_store_hpte(PPCVirtualHypervisor > *vhyp, hwaddr ptex, > } > } > > -static int spapr_hpt_shift_for_ramsize(uint64_t ramsize) > +int spapr_hpt_shift_for_ramsize(uint64_t ramsize) > { > int shift; > > diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c > index 9f88960..cdafc3f 100644 > --- a/hw/ppc/spapr_hcall.c > +++ b/hw/ppc/spapr_hcall.c > @@ -3,6 +3,7 @@ > #include "sysemu/hw_accel.h" > #include "sysemu/sysemu.h" > #include "qemu/log.h" > +#include "qemu/error-report.h" > #include "cpu.h" > #include "exec/exec-all.h" > #include "helper_regs.h" > @@ -352,20 +353,286 @@ static target_ulong h_read(PowerPCCPU *cpu, > sPAPRMachineState *spapr, > return H_SUCCESS; > } > > +struct sPAPRPendingHPT { > +/* These fields are read-only after initialization */ > +int shift; > +QemuThread thread; > + > +/* These fields are protected by the BQL */ > +bool complete; > + > +/* These fields are private to the preparation thread if > + * !complete, otherwise protected by the BQL */ > +int ret; > +void *hpt; > +}; > + > +static void free_pending_hpt(sPAPRPendingHPT *pending) > +{ > +if (pending->hpt) { > +qemu_vfree(pending->hpt); > +} > + > +g_free(pending); > +} > + > +static void *hpt_prepare_thread(void *opaque) > +{ > +sPAPRPendingHPT *pending = opaque; > +size_t size = 1ULL << pending->shift; > + > +pending->hpt = qemu_memalign(size, size); > 
+if (pending->hpt) { > +memset(pending->hpt, 0, size); > +pending->ret = H_SUCCESS; > +} else { > +pending->ret = H_NO_MEM; > +} > + > +qemu_mutex_lock_iothread(); > + > +if (SPAPR_MACHINE(qdev_get_machine())->pending_hpt == pending) { > +/* Ready to go */ > +pending->complete = true; > +} else { > +/* We've been cancelled, clean ourselves up */ > +free_pending_hpt(pending); > +} > + > +qemu_mutex_unlock_iothread(); > +return NULL; > +} > + > +/* Must be called with BQL held */ > +static void cancel_hpt_prepare(sPAPRMachineState *spapr) > +{ > +sPAPRPendingHPT *pending = spapr->pending_hpt; > + > +/* Let the thread know it's cancelled */ > +spapr->pending_hpt = NULL; > + > +if (!pending) { > +/* Nothing to do */ > +return; > +} > + > +if (!pending->complete) { > +/* thread will clean itself up */ > +return; > +} > + > +free_pending_hpt(pending); > +} > + > static target_ulong