Excerpts from Nicholas Piggin's message of February 16, 2022 9:38 pm:
> Excerpts from Cédric Le Goater's message of February 16, 2022 8:52 pm:
>> On 2/16/22 11:25, Nicholas Piggin wrote:
>>> This implements the Nested KVM HV hcall API for spapr under TCG.
>>>
>>> The L2 is switched in when the H_ENTER_NESTED hcall is made, and the
>>> L1 is switched back in, returning from the hcall, when an HV exception
>>> is sent to the vhyp. Register state is copied in and out according to
>>> the nested KVM HV hcall API specification.
>>>
>>> The hdecr timer is started when the L2 is switched in, and it provides
>>> the HDEC / 0x980 return to the L1.
>>>
>>> The MMU re-uses the bare metal radix 2-level page table walker by
>>> using the get_pate method to point the MMU to the nested partition
>>> table entry. MMU faults due to partition scope errors raise HV
>>> exceptions and accordingly are routed back to the L1.
>>>
>>> The MMU does not tag translations for the L1 (direct) vs L2 (nested)
>>> guests, so the TLB is flushed on any L1<->L2 transition (hcall entry
>>> and exit).
>>>
>>> Reviewed-by: Fabiano Rosas <faro...@linux.ibm.com>
>>> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
>>
>> Reviewed-by: Cédric Le Goater <c...@kaod.org>
>>
>> Some last comments below,
>
> [...]
>
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index edbf3eeed0..852fe61b36 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -199,6 +199,9 @@ struct SpaprMachineState {
>>>      bool has_graphics;
>>>      uint32_t vsmt; /* Virtual SMT mode (KVM's "core stride") */
>>>  
>>> +    /* Nested HV support (TCG only) */
>>> +    uint64_t nested_ptcr;
>>> +
>>
>> this is new state to migrate.
>>
>
> [...]
>
>>> +/* Linux 64-bit powerpc pt_regs struct, used by nested HV */
>>> +struct kvmppc_pt_regs {
>>> +    uint64_t gpr[32];
>>> +    uint64_t nip;
>>> +    uint64_t msr;
>>> +    uint64_t orig_gpr3;    /* Used for restarting system calls */
>>> +    uint64_t ctr;
>>> +    uint64_t link;
>>> +    uint64_t xer;
>>> +    uint64_t ccr;
>>> +    uint64_t softe;        /* Soft enabled/disabled */
>>> +    uint64_t trap;         /* Reason for being here */
>>> +    uint64_t dar;          /* Fault registers */
>>> +    uint64_t dsisr;        /* on 4xx/Book-E used for ESR */
>>> +    uint64_t result;       /* Result of a system call */
>>> +};
>>
>> I think we need to start moving all the spapr hcall definitions under
>> spapr_hcall.h. It can come later.
>
> Sure.
>
> [...]
>
>>> diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
>>> index dab3dfc76c..b560514560 100644
>>> --- a/include/hw/ppc/spapr_cpu_core.h
>>> +++ b/include/hw/ppc/spapr_cpu_core.h
>>> @@ -48,6 +48,11 @@ typedef struct SpaprCpuState {
>>>      bool prod; /* not migrated, only used to improve dispatch latencies */
>>>      struct ICPState *icp;
>>>      struct XiveTCTX *tctx;
>>> +
>>> +    /* Fields for nested-HV support */
>>> +    bool in_nested; /* true while the L2 is executing */
>>> +    CPUPPCState *nested_host_state; /* holds the L1 state while L2 executes */
>>> +    int64_t nested_tb_offset; /* L1->L2 TB offset */
>>
>> This needs a new vmstate.
>
> How about instead of the vmstate (we would need all the L1 state in
> nested_host_state as well), we just add a migration blocker in the
> L2 entry path. We could limit the max hdecr to say 1 second to
> ensure it unblocks before long.
>
> I know migration blockers are not preferred but in this case it gives
> us some iterations to debug and optimise first, which might change
> the data to migrate.
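(For reference, the vmstate route passed over above would look roughly like
the sketch below. This is hypothetical, not part of the patch: the scalar
fields are easy to describe, but nested_host_state holds a full CPUPPCState
snapshot of the L1, and writing a vmstate for that is the work being
deferred here.)

static bool nested_needed(void *opaque)
{
    SpaprCpuState *spapr_cpu = opaque;

    return spapr_cpu->in_nested;
}

/* Hypothetical subsection; the L1 state in nested_host_state is missing */
static const VMStateDescription vmstate_spapr_cpu_nested = {
    .name = "spapr_cpu/nested",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = nested_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(in_nested, SpaprCpuState),
        VMSTATE_INT64(nested_tb_offset, SpaprCpuState),
        VMSTATE_END_OF_LIST()
    }
};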
This should be roughly the incremental patch to do this.

Thanks,
Nick

--
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 87e68da77f..14e41b7d31 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2882,6 +2882,13 @@ static void spapr_machine_init(MachineState *machine)
                    "may run and log hardware error on the destination");
     }
 
+    if (spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV) == SPAPR_CAP_ON) {
+        /* Create the error string for live migration blocker */
+        error_setg(&spapr->nested_hv_migration_blocker,
+                   "A nested-hv L2 guest is running. Migration is blocked until it "
+                   "exits to the L1.");
+    }
+
     if (mc->nvdimm_supported) {
         spapr_create_nvdimm_dr_connectors(spapr);
     }
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e183892287..89295bc723 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -21,6 +21,7 @@
 #include "hw/ppc/spapr_numa.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "migration/blocker.h"
 
 bool is_ram_address(SpaprMachineState *spapr, hwaddr addr)
 {
@@ -1565,7 +1566,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
     target_ulong hv_ptr = args[0];
     target_ulong regs_ptr = args[1];
-    target_ulong hdec, now = cpu_ppc_load_tbl(env);
+    target_ulong hdec, now;
     target_ulong lpcr, lpcr_mask;
     struct kvmppc_hv_guest_state *hvstate;
     struct kvmppc_hv_guest_state hv_state;
@@ -1578,11 +1579,16 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
         return H_NOT_AVAILABLE;
     }
 
+    if (migrate_add_blocker(spapr->nested_hv_migration_blocker, NULL)) {
+        return 0; /* This returns nothing to the L1, essentially an EAGAIN */
+    }
+
     len = sizeof(*hvstate);
     hvstate = address_space_map(CPU(cpu)->as, hv_ptr, &len, false,
                                 MEMTXATTRS_UNSPECIFIED);
     if (len != sizeof(*hvstate)) {
         address_space_unmap(CPU(cpu)->as, hvstate, len, 0, false);
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_PARAMETER;
     }
 
@@ -1590,16 +1596,36 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
 
     address_space_unmap(CPU(cpu)->as, hvstate, len, len, false);
 
+    spapr_cpu->nested_tb_offset = hv_state.tb_offset;
+    spapr_cpu->nested_hdec_expiry = hv_state.hdec_expiry;
+
+    now = cpu_ppc_load_tbl(env);
+    if (now >= hv_state.hdec_expiry) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
+        return env->excp_vectors[POWERPC_EXCP_HDECR];
+    }
+
+    hdec = hv_state.hdec_expiry - now;
+    if (hdec > env->tb_env->tb_freq) {
+        /*
+         * Limit hdecr to 1 second to prevent the L1 blocking migration for
+         * too long with a large hdecr value.
+         */
+        hdec = env->tb_env->tb_freq;
+    }
+
     /*
      * We accept versions 1 and 2. Version 2 fields are unused because TCG
      * does not implement DAWR*.
      */
     if (hv_state.version > HV_GUEST_STATE_VERSION) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_PARAMETER;
     }
 
     spapr_cpu->nested_host_state = g_try_malloc(sizeof(CPUPPCState));
     if (!spapr_cpu->nested_host_state) {
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_NO_MEM;
     }
 
@@ -1611,6 +1637,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     if (!regs || len != sizeof(*regs)) {
         address_space_unmap(CPU(cpu)->as, regs, len, 0, false);
         g_free(spapr_cpu->nested_host_state);
+        migrate_del_blocker(spapr->nested_hv_migration_blocker);
         return H_P2;
     }
 
@@ -1648,8 +1675,6 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
     /* hv_state.amor is not used */
     env->spr[SPR_DPDES] = hv_state.dpdes;
     env->spr[SPR_HFSCR] = hv_state.hfscr;
-    hdec = hv_state.hdec_expiry - now;
-    spapr_cpu->nested_tb_offset = hv_state.tb_offset;
     /* TCG does not implement DAWR*, CIABR, PURR, SPURR, IC, VTB, HEIR SPRs */
     env->spr[SPR_SRR0] = hv_state.srr0;
     env->spr[SPR_SRR1] = hv_state.srr1;
@@ -1693,6 +1718,7 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
 
 void spapr_exit_nested(PowerPCCPU *cpu, int excp)
 {
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     CPUState *cs = CPU(cpu);
     CPUPPCState *env = &cpu->env;
     SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
@@ -1781,6 +1807,19 @@ void spapr_exit_nested(PowerPCCPU *cpu, int excp)
     /* Is it okay to specify write length larger than actual data written? */
     address_space_unmap(CPU(cpu)->as, regs, len, len, true);
 
+    /*
+     * hdecr is capped at entry, so we may exit here with a HDECR exception
+     * without having exceeded the guest's limit. Clear the HDECR interrupt
+     * return in this case.
+     */
+    if (excp == POWERPC_EXCP_HDECR) {
+        target_ulong now;
+
+        now = cpu_ppc_load_tbl(env) - spapr_cpu->nested_tb_offset;
+        if (now < spapr_cpu->nested_hdec_expiry) {
+            r3_return = 0;
+        }
+    }
+
 out_restore_l1:
     memcpy(env->gpr, spapr_cpu->nested_host_state->gpr, sizeof(env->gpr));
     env->lr = spapr_cpu->nested_host_state->lr;
@@ -1825,6 +1864,8 @@ out_restore_l1:
 
     g_free(spapr_cpu->nested_host_state);
     spapr_cpu->nested_host_state = NULL;
+
+    migrate_del_blocker(spapr->nested_hv_migration_blocker);
 }
 
 static void hypercall_register_types(void)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 852fe61b36..70b330ef9a 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -266,6 +266,7 @@ struct SpaprMachineState {
     uint32_t FORM2_assoc_array[NUMA_NODES_MAX_NUM][FORM2_NUMA_ASSOC_SIZE];
 
     Error *fwnmi_migration_blocker;
+    Error *nested_hv_migration_blocker;
 };
 
 #define H_SUCCESS 0
diff --git a/include/hw/ppc/spapr_cpu_core.h b/include/hw/ppc/spapr_cpu_core.h
index b560514560..09da577ca1 100644
--- a/include/hw/ppc/spapr_cpu_core.h
+++ b/include/hw/ppc/spapr_cpu_core.h
@@ -53,6 +53,7 @@ typedef struct SpaprCpuState {
     bool in_nested; /* true while the L2 is executing */
     CPUPPCState *nested_host_state; /* holds the L1 state while L2 executes */
     int64_t nested_tb_offset; /* L1->L2 TB offset */
+    uint64_t nested_hdec_expiry; /* L1 hdec expiry in absolute L1 TB */
 } SpaprCpuState;
 
 static inline SpaprCpuState *spapr_cpu_state(PowerPCCPU *cpu)
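(To put rough numbers on the 1-second cap, here is a standalone sketch with
made-up values. It assumes the usual 512MHz POWER timebase; nothing here is
read from a real guest.)

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t tb_freq = 512000000;               /* 512MHz timebase, ticks/s */
    uint64_t now = 0x1000000000;                /* current L1 timebase value */
    uint64_t hdec_expiry = now + 5 * tb_freq;   /* L1 asked for ~5 seconds */
    uint64_t hdec = hdec_expiry - now;          /* 2560000000 ticks */

    if (hdec > tb_freq) {
        hdec = tb_freq;                         /* capped to ~1s of ticks */
    }

    /*
     * So the L2 runs for at most ~1s before a HDECR exit reaches
     * spapr_exit_nested(), which drops the migration blocker. Because
     * now < nested_hdec_expiry at that point, r3_return is cleared, so
     * the L1 sees no HDECR and can simply H_ENTER_NESTED again.
     */
    printf("armed hdecr: %" PRIu64 " ticks (~%.1f s)\n",
           hdec, (double)hdec / tb_freq);
    return 0;
}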