Re: [PATCH v3 48/49] hw/i386/sev: Use guest_memfd for legacy ROMs

2024-03-20 Thread Isaku Yamahata
On Wed, Mar 20, 2024 at 03:39:44AM -0500,
Michael Roth  wrote:

> TODO: make this SNP-specific if TDX disables legacy ROMs in general

TDX disables pc.rom, but does not disable isa-bios. IIRC, TDX doesn't need pc pflash.
Xiaoyao can chime in.

Thanks,

> 
> Current SNP guest kernels will attempt to access these regions with
> with C-bit set, so guest_memfd is needed to handle that. Otherwise,
> kvm_convert_memory() will fail when the guest kernel tries to access it
> and QEMU attempts to call KVM_SET_MEMORY_ATTRIBUTES to set these ranges
> to private.
> 
> Whether guests should actually try to access ROM regions in this way (or
> need to deal with legacy ROM regions at all), is a separate issue to be
> addressed on kernel side, but current SNP guest kernels will exhibit
> this behavior and so this handling is needed to allow QEMU to continue
> running existing SNP guest kernels.
> 
> Signed-off-by: Michael Roth 
> ---
>  hw/i386/pc.c   | 13 +
>  hw/i386/pc_sysfw.c | 13 ++---
>  2 files changed, 19 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index feb7a93083..5feaeb43ee 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1011,10 +1011,15 @@ void pc_memory_init(PCMachineState *pcms,
>  pc_system_firmware_init(pcms, rom_memory);
>  
>  option_rom_mr = g_malloc(sizeof(*option_rom_mr));
> -memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
> -   _fatal);
> -if (pcmc->pci_enabled) {
> -memory_region_set_readonly(option_rom_mr, true);
> +if (machine_require_guest_memfd(machine)) {
> +memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom",
> +   PC_ROM_SIZE, _fatal);
> +} else {
> +memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
> +   _fatal);
> +if (pcmc->pci_enabled) {
> +memory_region_set_readonly(option_rom_mr, true);
> +}
>  }
>  memory_region_add_subregion_overlap(rom_memory,
>  PC_ROM_MIN_VGA,
> diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
> index 9dbb3f7337..850f86edd4 100644
> --- a/hw/i386/pc_sysfw.c
> +++ b/hw/i386/pc_sysfw.c
> @@ -54,8 +54,13 @@ static void pc_isa_bios_init(MemoryRegion *rom_memory,
>  /* map the last 128KB of the BIOS in ISA space */
>  isa_bios_size = MIN(flash_size, 128 * KiB);
>  isa_bios = g_malloc(sizeof(*isa_bios));
> -memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
> -   _fatal);
> +if (machine_require_guest_memfd(current_machine)) {
> +memory_region_init_ram_guest_memfd(isa_bios, NULL, "isa-bios",
> +   isa_bios_size, _fatal);
> +} else {
> +memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size,
> +   _fatal);
> +}
>  memory_region_add_subregion_overlap(rom_memory,
>  0x10 - isa_bios_size,
>  isa_bios,
> @@ -68,7 +73,9 @@ static void pc_isa_bios_init(MemoryRegion *rom_memory,
> ((uint8_t*)flash_ptr) + (flash_size - isa_bios_size),
> isa_bios_size);
>  
> -    memory_region_set_readonly(isa_bios, true);
> +if (!machine_require_guest_memfd(current_machine)) {
> +memory_region_set_readonly(isa_bios, true);
> +}
>  }
>  
>  static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
> -- 
> 2.25.1
> 
> 

-- 
Isaku Yamahata 



Re: [PATCH v3 40/49] hw/i386/sev: Add function to get SEV metadata from OVMF header

2024-03-20 Thread Isaku Yamahata
On Wed, Mar 20, 2024 at 03:39:36AM -0500,
Michael Roth  wrote:

> From: Brijesh Singh 
> 
> A recent version of OVMF expanded the reset vector GUID list to add
> SEV-specific metadata GUID. The SEV metadata describes the reserved
> memory regions such as the secrets and CPUID page used during the SEV-SNP
> guest launch.
> 
> The pc_system_get_ovmf_sev_metadata_ptr() is used to retieve the SEV
> metadata pointer from the OVMF GUID list.
> 
> Signed-off-by: Brijesh Singh 
> Signed-off-by: Michael Roth 
> ---
>  hw/i386/pc_sysfw_ovmf.c | 33 +
>  include/hw/i386/pc.h| 26 ++
>  2 files changed, 59 insertions(+)
> 
> diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c
> index 07a4c267fa..32efa34614 100644
> --- a/hw/i386/pc_sysfw_ovmf.c
> +++ b/hw/i386/pc_sysfw_ovmf.c
> @@ -35,6 +35,31 @@ static const int bytes_after_table_footer = 32;
>  static bool ovmf_flash_parsed;
>  static uint8_t *ovmf_table;
>  static int ovmf_table_len;
> +static OvmfSevMetadata *ovmf_sev_metadata_table;
> +
> +#define OVMF_SEV_META_DATA_GUID "dc886566-984a-4798-A75e-5585a7bf67cc"
> +typedef struct __attribute__((__packed__)) OvmfSevMetadataOffset {
> +uint32_t offset;
> +} OvmfSevMetadataOffset;
> +
> +static void pc_system_parse_sev_metadata(uint8_t *flash_ptr, size_t 
> flash_size)
> +{
> +OvmfSevMetadata *metadata;
> +OvmfSevMetadataOffset  *data;
> +
> +if (!pc_system_ovmf_table_find(OVMF_SEV_META_DATA_GUID, (uint8_t 
> **),
> +   NULL)) {
> +return;
> +}
> +
> +metadata = (OvmfSevMetadata *)(flash_ptr + flash_size - data->offset);
> +if (memcmp(metadata->signature, "ASEV", 4) != 0) {
> +return;
> +}
> +
> +ovmf_sev_metadata_table = g_malloc(metadata->len);
> +memcpy(ovmf_sev_metadata_table, metadata, metadata->len);
> +}
>  
>  void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size)
>  {
> @@ -90,6 +115,9 @@ void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t 
> flash_size)
>   */
>  memcpy(ovmf_table, ptr - tot_len, tot_len);
>  ovmf_table += tot_len;
> +
> +/* Copy the SEV metadata table (if exist) */
> +pc_system_parse_sev_metadata(flash_ptr, flash_size);
>  }

Can we move this call to x86_firmware_configure() @ pc_sysfw.c, and move the sev
specific bits to a sev specific file?  We don't have to parse sev
metadata for the non-SEV case, right?

We don't have to touch common ovmf file. It also will be consistent with tdx
case.  TDX patch series adds tdx_parse_tdvf() to x86_firmware_configure().

thanks,

>  
>  /**
> @@ -159,3 +187,8 @@ bool pc_system_ovmf_table_find(const char *entry, uint8_t 
> **data,
>  }
>  return false;
>  }
> +
> +OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void)
> +{
> +return ovmf_sev_metadata_table;
> +}
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index fb1d4106e5..df9a61540d 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -163,6 +163,32 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int 
> level);
>  #define PCI_HOST_ABOVE_4G_MEM_SIZE "above-4g-mem-size"
>  #define PCI_HOST_PROP_SMM_RANGES   "smm-ranges"
>  
> +typedef enum {
> +SEV_DESC_TYPE_UNDEF,
> +/* The section contains the region that must be validated by the VMM. */
> +SEV_DESC_TYPE_SNP_SEC_MEM,
> +/* The section contains the SNP secrets page */
> +SEV_DESC_TYPE_SNP_SECRETS,
> +/* The section contains address that can be used as a CPUID page */
> +SEV_DESC_TYPE_CPUID,
> +
> +} ovmf_sev_metadata_desc_type;
> +
> +typedef struct __attribute__((__packed__)) OvmfSevMetadataDesc {
> +uint32_t base;
> +uint32_t len;
> +ovmf_sev_metadata_desc_type type;
> +} OvmfSevMetadataDesc;
> +
> +typedef struct __attribute__((__packed__)) OvmfSevMetadata {
> +uint8_t signature[4];
> +uint32_t len;
> +uint32_t version;
> +uint32_t num_desc;
> +OvmfSevMetadataDesc descs[];
> +} OvmfSevMetadata;
> +
> +OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void);
>  
>  void pc_pci_as_mapping_init(MemoryRegion *system_memory,
>  MemoryRegion *pci_address_space);
> -- 
> 2.25.1
> 
> 

-- 
Isaku Yamahata 



Re: [PATCH v5 15/65] i386/tdx: Get tdx_capabilities via KVM_TDX_CAPABILITIES

2024-03-12 Thread Isaku Yamahata
On Thu, Feb 29, 2024 at 01:36:36AM -0500,
Xiaoyao Li  wrote:

> KVM provides TDX capabilities via sub command KVM_TDX_CAPABILITIES of
> IOCTL(KVM_MEMORY_ENCRYPT_OP). Get the capabilities when initializing
> TDX context. It will be used to validate user's setting later.
> 
> Since there is no interface reporting how many cpuid configs contains in
> KVM_TDX_CAPABILITIES, QEMU chooses to try starting with a known number
> and abort when it exceeds KVM_MAX_CPUID_ENTRIES.
> 
> Besides, introduce the interfaces to invoke TDX "ioctls" at different
> scope (KVM, VM and VCPU) in preparation.
> 
> Signed-off-by: Xiaoyao Li 
> ---
> Changes in v4:
> - use {} to initialize struct kvm_tdx_cmd, to avoid memset();
> - remove tdx_platform_ioctl() because no user;
> 
> Changes in v3:
> - rename __tdx_ioctl() to tdx_ioctl_internal()
> - Pass errp in get_tdx_capabilities();
> 
> changes in v2:
>   - Make the error message more clear;
> 
> changes in v1:
>   - start from nr_cpuid_configs = 6 for the loop;
>   - stop the loop when nr_cpuid_configs exceeds KVM_MAX_CPUID_ENTRIES;
> ---
>  target/i386/kvm/kvm.c  |  2 -
>  target/i386/kvm/kvm_i386.h |  2 +
>  target/i386/kvm/tdx.c  | 91 +-
>  3 files changed, 92 insertions(+), 3 deletions(-)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 52d99d30bdc8..0e68e80f4291 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -1685,8 +1685,6 @@ static int hyperv_init_vcpu(X86CPU *cpu)
>  
>  static Error *invtsc_mig_blocker;
>  
> -#define KVM_MAX_CPUID_ENTRIES  100
> -
>  static void kvm_init_xsave(CPUX86State *env)
>  {
>  if (has_xsave2) {
> diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
> index 55fb25fa8e2e..c3ef46a97a7b 100644
> --- a/target/i386/kvm/kvm_i386.h
> +++ b/target/i386/kvm/kvm_i386.h
> @@ -13,6 +13,8 @@
>  
>  #include "sysemu/kvm.h"
>  
> +#define KVM_MAX_CPUID_ENTRIES  100
> +
>  #ifdef CONFIG_KVM
>  
>  #define kvm_pit_in_kernel() \
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index d9a1dd46dc69..2b956450a083 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -12,18 +12,107 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
> +#include "kvm_i386.h"
>  #include "tdx.h"
>  
> +static struct kvm_tdx_capabilities *tdx_caps;
> +
> +enum tdx_ioctl_level{
> +TDX_VM_IOCTL,
> +TDX_VCPU_IOCTL,
> +};
> +
> +static int tdx_ioctl_internal(void *state, enum tdx_ioctl_level level, int 
> cmd_id,
> +__u32 flags, void *data)
> +{
> +struct kvm_tdx_cmd tdx_cmd = {};
> +int r;
> +
> +tdx_cmd.id = cmd_id;
> +tdx_cmd.flags = flags;
> +tdx_cmd.data = (__u64)(unsigned long)data;
> +
> +switch (level) {
> +case TDX_VM_IOCTL:
> +r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VCPU_IOCTL:
> +r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +default:
> +error_report("Invalid tdx_ioctl_level %d", level);
> +exit(1);
> +}
> +
> +return r;
> +}
> +
> +static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data)
> +{
> +return tdx_ioctl_internal(NULL, TDX_VM_IOCTL, cmd_id, flags, data);
> +}
> +
> +static inline int tdx_vcpu_ioctl(void *vcpu_fd, int cmd_id, __u32 flags,
> + void *data)

As kvm_vcpu_ioctl(CPUState *cpu, int type, ...) takes CPUState *, this can be
tdx_vcpu_ioctl(CPUState *cpu, ) instead of void *.
I struggled to find my mistake of passing "int vcpu_fd" to this function.
-- 
Isaku Yamahata 



Re: [PATCH v5 49/65] i386/tdx: handle TDG.VP.VMCALL

2024-03-12 Thread Isaku Yamahata
On Tue, Mar 12, 2024 at 03:44:32PM +0800,
Xiaoyao Li  wrote:

> On 3/11/2024 5:27 PM, Daniel P. Berrangé wrote:
> > On Thu, Feb 29, 2024 at 01:37:10AM -0500, Xiaoyao Li wrote:
> > > From: Isaku Yamahata 
> > > 
> > > Add property "quote-generation-socket" to tdx-guest, which is a property
> > > of type SocketAddress to specify Quote Generation Service(QGS).
> > > 
> > > On request of GetQuote, it connects to the QGS socket, read request
> > > data from shared guest memory, send the request data to the QGS,
> > > and store the response into shared guest memory, at last notify
> > > TD guest by interrupt.
> > > 
> > > command line example:
> > >qemu-system-x86_64 \
> > >  -object 
> > > '{"qom-type":"tdx-guest","id":"tdx0","quote-generation-socket":{"type": 
> > > "vsock", "cid":"1","port":"1234"}}' \
> > 
> > Can you illustrate this with 'unix' sockets, not 'vsock'.
> 
> Are you suggesting only updating the commit message to an example of unix
> socket? Or you want the code to test with some unix socket QGS?
> 
> (It seems the QGS I got for testing, only supports vsock socket. Because at
> the time when it got developed, it was supposed to communicate with drivers
> inside TD guest directly not via VMM (KVM+QEMU). Anyway, I will talk to
> internal folks to see if any plan to support unix socket.)

You can use a small utility to proxy sockets for testing purposes. The famous ones
are socat and nmap's ncat. They support vsock, at least in their latest versions.

QGS <-vsock-> socat <-unix domain-> qemu
-- 
Isaku Yamahata 



Re: [PATCH v5 08/65] kvm: handle KVM_EXIT_MEMORY_FAULT

2024-03-05 Thread Isaku Yamahata
On Thu, Feb 29, 2024 at 01:36:29AM -0500,
Xiaoyao Li  wrote:

> From: Chao Peng 
> 
> When geeting KVM_EXIT_MEMORY_FAULT exit, it indicates userspace needs to
> do the memory conversion on the RAMBlock to turn the memory into desired
> attribute, i.e., private/shared.
> 
> Currently only KVM_MEMORY_EXIT_FLAG_PRIVATE in flags is valid when
> KVM_EXIT_MEMORY_FAULT happens.
> 
> Note, KVM_EXIT_MEMORY_FAULT makes sense only when the RAMBlock has
> guest_memfd memory backend.
> 
> Note, KVM_EXIT_MEMORY_FAULT returns with -EFAULT, so special handling is
> added.
> 
> When page is converted from shared to private, the original shared
> memory can be discarded via ram_block_discard_range(). Note, shared
> memory can be discarded only when it's not back'ed by hugetlb because
> hugetlb is supposed to be pre-allocated and no need for discarding.
> 
> Signed-off-by: Chao Peng 
> Co-developed-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 
> 
> ---
> Changes in v4:
> - open-coded ram_block_discard logic;
> - change warn_report() to error_report(); (Daniel)
> ---
>  accel/kvm/kvm-all.c | 94 -
>  1 file changed, 84 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 70d482a2c936..87e4275932a7 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2903,6 +2903,68 @@ static void kvm_eat_signals(CPUState *cpu)
>  } while (sigismember(, SIG_IPI));
>  }
>  
> +static int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> +{
> +MemoryRegionSection section;
> +ram_addr_t offset;
> +MemoryRegion *mr;
> +RAMBlock *rb;
> +void *addr;
> +int ret = -1;
> +
> +if (!QEMU_PTR_IS_ALIGNED(start, qemu_host_page_size) ||
> +!QEMU_PTR_IS_ALIGNED(size, qemu_host_page_size)) {
> +return -1;
> +}
> +
> +if (!size) {
> +return -1;
> +}
> +
> +section = memory_region_find(get_system_memory(), start, size);
> +mr = section.mr;
> +if (!mr) {
> +return -1;
> +}
> +
> +if (memory_region_has_guest_memfd(mr)) {
> +if (to_private) {
> +ret = kvm_set_memory_attributes_private(start, size);
> +} else {
> +ret = kvm_set_memory_attributes_shared(start, size);
> +}
> +
> +if (ret) {
> +memory_region_unref(section.mr);
> +return ret;
> +}
> +
> +addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
> +rb = qemu_ram_block_from_host(addr, false, );
> +
> +if (to_private) {
> +if (rb->page_size != qemu_host_page_size) {
> +/*
> +* shared memory is back'ed by  hugetlb, which is supposed to 
> be
> +* pre-allocated and doesn't need to be discarded
> +*/
> +return 0;

The reference count leaks. Adding a memory_region_unref() is needed.

Otherwise looks good to me.
Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [PATCH v3 19/70] i386/tdx: Introduce is_tdx_vm() helper and cache tdx_guest object

2023-11-17 Thread Isaku Yamahata
On Wed, Nov 15, 2023 at 02:14:28AM -0500,
Xiaoyao Li  wrote:

> It will need special handling for TDX VMs all around the QEMU.
> Introduce is_tdx_vm() helper to query if it's a TDX VM.
> 
> Cache tdx_guest object thus no need to cast from ms->cgs every time.
> 
> Signed-off-by: Xiaoyao Li 
> Acked-by: Gerd Hoffmann 
> ---
> changes in v3:
> - replace object_dynamic_cast with TDX_GUEST();
> ---
>  target/i386/kvm/tdx.c | 15 ++-
>  target/i386/kvm/tdx.h | 10 ++
>  2 files changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index cb0040187b27..cf8889f0a8f9 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -21,8 +21,16 @@
>  #include "kvm_i386.h"
>  #include "tdx.h"
>  
> +static TdxGuest *tdx_guest;
> +
>  static struct kvm_tdx_capabilities *tdx_caps;
>  
> +/* It's valid after kvm_confidential_guest_init()->kvm_tdx_init() */
> +bool is_tdx_vm(void)
> +{
> +return !!tdx_guest;
> +}
> +
>  enum tdx_ioctl_level{
>  TDX_PLATFORM_IOCTL,
>  TDX_VM_IOCTL,
> @@ -114,15 +122,20 @@ static int get_tdx_capabilities(Error **errp)
>  
>  int tdx_kvm_init(MachineState *ms, Error **errp)
>  {
> +TdxGuest *tdx = TDX_GUEST(OBJECT(ms->cgs));
>  int r = 0;
>  
>  ms->require_guest_memfd = true;
>  
>  if (!tdx_caps) {
>  r = get_tdx_capabilities(errp);
> +if (r) {
> +return r;
> +}
>  }
>  
> -return r;
> +tdx_guest = tdx;
> +return 0;
>  }
>  
>  /* tdx guest */
> diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h
> index c8a23d95258d..4036ca2f3f99 100644
> --- a/target/i386/kvm/tdx.h
> +++ b/target/i386/kvm/tdx.h
> @@ -1,6 +1,10 @@
>  #ifndef QEMU_I386_TDX_H
>  #define QEMU_I386_TDX_H
>  
> +#ifndef CONFIG_USER_ONLY
> +#include CONFIG_DEVICES /* CONFIG_TDX */
> +#endif
> +
>  #include "exec/confidential-guest-support.h"
>  
>  #define TYPE_TDX_GUEST "tdx-guest"
> @@ -16,6 +20,12 @@ typedef struct TdxGuest {
>  uint64_t attributes;    /* TD attributes */
>  } TdxGuest;
>  
> +#ifdef CONFIG_TDX
> +bool is_tdx_vm(void);
> +#else
> +#define is_tdx_vm() 0
> +#endif /* CONFIG_TDX */
> +
>  int tdx_kvm_init(MachineState *ms, Error **errp);
>  
>  #endif /* QEMU_I386_TDX_H */
> -- 
> 2.34.1
> 
> 

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [PATCH v3 18/70] i386/tdx: Get tdx_capabilities via KVM_TDX_CAPABILITIES

2023-11-17 Thread Isaku Yamahata
On Wed, Nov 15, 2023 at 02:14:27AM -0500,
Xiaoyao Li  wrote:

> KVM provides TDX capabilities via sub command KVM_TDX_CAPABILITIES of
> IOCTL(KVM_MEMORY_ENCRYPT_OP). Get the capabilities when initializing
> TDX context. It will be used to validate user's setting later.
> 
> Since there is no interface reporting how many cpuid configs contains in
> KVM_TDX_CAPABILITIES, QEMU chooses to try starting with a known number
> and abort when it exceeds KVM_MAX_CPUID_ENTRIES.
> 
> Besides, introduce the interfaces to invoke TDX "ioctls" at different
> scope (KVM, VM and VCPU) in preparation.
> 
> Signed-off-by: Xiaoyao Li 
> ---
> Changes in v3:
> - rename __tdx_ioctl() to tdx_ioctl_internal()
> - Pass errp in get_tdx_capabilities();
> 
> changes in v2:
>   - Make the error message more clear;
> 
> changes in v1:
>   - start from nr_cpuid_configs = 6 for the loop;
>   - stop the loop when nr_cpuid_configs exceeds KVM_MAX_CPUID_ENTRIES;
> ---
>  target/i386/kvm/kvm.c  |   2 -
>  target/i386/kvm/kvm_i386.h |   2 +
>  target/i386/kvm/tdx.c  | 102 -
>  3 files changed, 103 insertions(+), 3 deletions(-)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 7abcdebb1452..28e60c5ea4a7 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -1687,8 +1687,6 @@ static int hyperv_init_vcpu(X86CPU *cpu)
>  
>  static Error *invtsc_mig_blocker;
>  
> -#define KVM_MAX_CPUID_ENTRIES  100
> -
>  static void kvm_init_xsave(CPUX86State *env)
>  {
>  if (has_xsave2) {
> diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
> index 55fb25fa8e2e..c3ef46a97a7b 100644
> --- a/target/i386/kvm/kvm_i386.h
> +++ b/target/i386/kvm/kvm_i386.h
> @@ -13,6 +13,8 @@
>  
>  #include "sysemu/kvm.h"
>  
> +#define KVM_MAX_CPUID_ENTRIES  100
> +
>  #ifdef CONFIG_KVM
>  
>  #define kvm_pit_in_kernel() \
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 621a05beeb4e..cb0040187b27 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -12,17 +12,117 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/error-report.h"
>  #include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
> +#include "kvm_i386.h"
>  #include "tdx.h"
>  
> +static struct kvm_tdx_capabilities *tdx_caps;
> +
> +enum tdx_ioctl_level{
> +TDX_PLATFORM_IOCTL,
> +TDX_VM_IOCTL,
> +TDX_VCPU_IOCTL,
> +};
> +
> +static int tdx_ioctl_internal(void *state, enum tdx_ioctl_level level, int 
> cmd_id,
> +__u32 flags, void *data)
> +{
> +struct kvm_tdx_cmd tdx_cmd;
> +int r;
> +
> +memset(_cmd, 0x0, sizeof(tdx_cmd));
> +
> +tdx_cmd.id = cmd_id;
> +tdx_cmd.flags = flags;
> +tdx_cmd.data = (__u64)(unsigned long)data;
> +
> +switch (level) {
> +case TDX_PLATFORM_IOCTL:
> +r = kvm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VM_IOCTL:
> +r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VCPU_IOCTL:
> +r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +default:
> +error_report("Invalid tdx_ioctl_level %d", level);
> +exit(1);
> +}
> +
> +return r;
> +}
> +
> +static inline int tdx_platform_ioctl(int cmd_id, __u32 flags, void *data)
> +{
> +return tdx_ioctl_internal(NULL, TDX_PLATFORM_IOCTL, cmd_id, flags, data);
> +}
> +
> +static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data)
> +{
> +return tdx_ioctl_internal(NULL, TDX_VM_IOCTL, cmd_id, flags, data);
> +}
> +
> +static inline int tdx_vcpu_ioctl(void *vcpu_fd, int cmd_id, __u32 flags,
> + void *data)
> +{
> +return  tdx_ioctl_internal(vcpu_fd, TDX_VCPU_IOCTL, cmd_id, flags, data);
> +}

As none of the ioctl variants are used yet, we can split them out into an independent
patch that defines the ioctl functions.


> +
> +static int get_tdx_capabilities(Error **errp)
> +{
> +struct kvm_tdx_capabilities *caps;
> +/* 1st generation of TDX reports 6 cpuid configs */
> +int nr_cpuid_configs = 6;
> +size_t size;
> +int r;
> +
> +do {
> +size = sizeof(struct kvm_tdx_capabilities) +
> +   nr_cpuid_configs * sizeof(struct kvm_tdx_cpuid_config);
> +caps = g_malloc0(size);
> +caps->nr_cpuid_configs = nr_cpuid_configs;
> +
> +r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps);
> +if (r == -E2BIG) {
> +g_free(caps);
> +nr_cpuid_configs *= 2;

g_realloc()?  Maybe a matter of preference.

Other than this, it looks good to me.
-- 
Isaku Yamahata 



Re: [PATCH v3 09/70] physmem: Introduce ram_block_convert_range() for page conversion

2023-11-17 Thread Isaku Yamahata
On Wed, Nov 15, 2023 at 02:14:18AM -0500,
Xiaoyao Li  wrote:

> It's used for discarding opposite memory after memory conversion, for
> confidential guest.
> 
> When page is converted from shared to private, the original shared
> memory can be discarded via ram_block_discard_range();
> 
> When page is converted from private to shared, the original private
> memory is back'ed by guest_memfd. Introduce
> ram_block_discard_guest_memfd_range() for discarding memory in
> guest_memfd.
> 
> Originally-from: Isaku Yamahata 
> Codeveloped-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 
> ---
>  include/exec/cpu-common.h |  2 ++
>  system/physmem.c  | 50 +++
>  2 files changed, 52 insertions(+)
> 
> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index 41115d891940..de728a18eef2 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -175,6 +175,8 @@ typedef int (RAMBlockIterFunc)(RAMBlock *rb, void 
> *opaque);
>  
>  int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
>  int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length);
> +int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length,
> +bool shared_to_private);
>  
>  #endif
>  
> diff --git a/system/physmem.c b/system/physmem.c
> index ddfecddefcd6..cd6008fa09ad 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -3641,6 +3641,29 @@ err:
>  return ret;
>  }
>  
> +static int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t start,
> +   size_t length)
> +{
> +int ret = -1;
> +
> +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
> +ret = fallocate(rb->guest_memfd, FALLOC_FL_PUNCH_HOLE | 
> FALLOC_FL_KEEP_SIZE,
> +start, length);
> +
> +if (ret) {
> +ret = -errno;
> +error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)",
> + __func__, rb->idstr, start, length, ret);
> +}
> +#else
> +ret = -ENOSYS;
> +error_report("%s: fallocate not available %s:%" PRIx64 " +%zx (%d)",
> + __func__, rb->idstr, start, length, ret);
> +#endif
> +
> +return ret;
> +}
> +
>  bool ramblock_is_pmem(RAMBlock *rb)
>  {
>  return rb->flags & RAM_PMEM;
> @@ -3828,3 +3851,30 @@ bool ram_block_discard_is_required(void)
>  return qatomic_read(_block_discard_required_cnt) ||
> qatomic_read(_block_coordinated_discard_required_cnt);
>  }
> +
> +int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length,
> +bool shared_to_private)
> +{
> +if (!rb || rb->guest_memfd < 0) {
> +return -1;
> +}
> +
> +if (!QEMU_PTR_IS_ALIGNED(start, qemu_host_page_size) ||
> +!QEMU_PTR_IS_ALIGNED(length, qemu_host_page_size)) {
> +return -1;
> +}
> +
> +if (!length) {
> +return -1;
> +}
> +
> +if (start + length > rb->max_length) {
> +return -1;
> +}
> +
> +if (shared_to_private) {
> +return ram_block_discard_range(rb, start, length);
> +} else {
> +return ram_block_discard_guest_memfd_range(rb, start, length);
> +}
> +}

Originally this function issued KVM_SET_MEMORY_ATTRIBUTES, so the function name
made sense. But now it doesn't, and it only issues a punch hole. We should rename
it to represent what it actually does. discard_range?
-- 
Isaku Yamahata 



Re: [PATCH v3 05/70] kvm: Enable KVM_SET_USER_MEMORY_REGION2 for memslot

2023-11-17 Thread Isaku Yamahata
On Wed, Nov 15, 2023 at 02:14:14AM -0500,
Xiaoyao Li  wrote:

> From: Chao Peng 
> 
> Switch to KVM_SET_USER_MEMORY_REGION2 when supported by KVM.
> 
> With KVM_SET_USER_MEMORY_REGION2, QEMU can set up memory region that
> backend'ed both by hva-based shared memory and guest memfd based private
> memory.
> 
> Signed-off-by: Chao Peng 
> Co-developed-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 
> ---
>  accel/kvm/kvm-all.c  | 56 ++--
>  accel/kvm/trace-events   |  2 +-
>  include/sysemu/kvm_int.h |  2 ++
>  3 files changed, 51 insertions(+), 9 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 9f751d4971f8..69afeb47c9c0 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -293,35 +293,69 @@ int kvm_physical_memory_addr_from_host(KVMState *s, 
> void *ram,
>  static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, 
> bool new)
>  {
>  KVMState *s = kvm_state;
> -struct kvm_userspace_memory_region mem;
> +struct kvm_userspace_memory_region2 mem;
> +static int cap_user_memory2 = -1;
>  int ret;
>  
> +if (cap_user_memory2 == -1) {
> +cap_user_memory2 = kvm_check_extension(s, KVM_CAP_USER_MEMORY2);
> +}
> +
> +if (!cap_user_memory2 && slot->guest_memfd >= 0) {
> +error_report("%s, KVM doesn't support KVM_CAP_USER_MEMORY2,"
> + " which is required by guest memfd!", __func__);
> +exit(1);
> +}
> +
>  mem.slot = slot->slot | (kml->as_id << 16);
>  mem.guest_phys_addr = slot->start_addr;
>  mem.userspace_addr = (unsigned long)slot->ram;
>  mem.flags = slot->flags;
> +mem.guest_memfd = slot->guest_memfd;
> +mem.guest_memfd_offset = slot->guest_memfd_offset;
>  
>  if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & 
> KVM_MEM_READONLY) {
>  /* Set the slot size to 0 before setting the slot to the desired
>   * value. This is needed based on KVM commit 75d61fbc. */
>  mem.memory_size = 0;
> -ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, );
> +
> +if (cap_user_memory2) {
> +ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, );
> +} else {
> +ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, );
> + }
>  if (ret < 0) {
>  goto err;
>  }
>  }
>  mem.memory_size = slot->memory_size;
> -ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, );
> +if (cap_user_memory2) {
> +ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, );
> +} else {
> +ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, );
> +}
>  slot->old_flags = mem.flags;
>  err:
>  trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
>mem.guest_phys_addr, mem.memory_size,
> -  mem.userspace_addr, ret);
> +  mem.userspace_addr, mem.guest_memfd,
> +  mem.guest_memfd_offset, ret);
>  if (ret < 0) {
> -error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
> - " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
> - __func__, mem.slot, slot->start_addr,
> - (uint64_t)mem.memory_size, strerror(errno));
> +if (cap_user_memory2) {
> +error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, 
> slot=%d,"
> +" start=0x%" PRIx64 ", size=0x%" PRIx64 ","
> +" flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
> +" guest_memfd_offset=0x%" PRIx64 ": %s",
> +__func__, mem.slot, slot->start_addr,
> +(uint64_t)mem.memory_size, mem.flags,
> +mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
> +strerror(errno));
> +} else {
> +error_report("%s: KVM_SET_USER_MEMORY_REGION failed, 
> slot=%d,"
> +" start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
> +    __func__, mem.slot, slot->start_addr,
> +(uint64_t)mem.memory_size, strerror(errno));
> +}
>  }
>  return ret;
>  }
> @@ -477,6 +511,9 @@ static int kvm_mem_flags(MemoryRegion *mr)
>  if (readonly && kvm_readonly_mem_allowed) {
>  flags |= KVM_MEM_READONLY;
>  }
> +if (memory_region_has_guest_memfd(mr)) {
> +flags |= KVM_MEM_PRIVATE;
> +}

Nitpick: it was renamed to KVM_MEM_GUEST_MEMFD
As long as the value is defined to same value, it doesn't matter, though.
-- 
Isaku Yamahata 



Re: [PATCH v3 02/70] RAMBlock: Add support of KVM private guest memfd

2023-11-17 Thread Isaku Yamahata
On Wed, Nov 15, 2023 at 02:14:11AM -0500,
Xiaoyao Li  wrote:

> diff --git a/system/physmem.c b/system/physmem.c
> index fc2b0fee0188..0af2213cbd9c 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1841,6 +1841,20 @@ static void ram_block_add(RAMBlock *new_block, Error 
> **errp)
>  }
>  }
>  
> +#ifdef CONFIG_KVM
> +if (kvm_enabled() && new_block->flags & RAM_GUEST_MEMFD &&
> +new_block->guest_memfd < 0) {
> +/* TODO: to decide if KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is supported */
> +uint64_t flags = 0;
> +new_block->guest_memfd = 
> kvm_create_guest_memfd(new_block->max_length,
> +flags, errp);
> +if (new_block->guest_memfd < 0) {
> +qemu_mutex_unlock_ramlist();
> +return;
> +}
> +}
> +#endif
> +

We should define a kvm_create_guest_memfd() stub in accel/stubs/kvm-stub.c.
Then we can remove this #ifdef.
-- 
Isaku Yamahata 



Re: [PATCH v2 13/58] kvm: Introduce kvm_arch_pre_create_vcpu()

2023-08-30 Thread Isaku Yamahata
On Wed, Aug 30, 2023 at 09:45:58AM +0800,
Xiaoyao Li  wrote:

> On 8/29/2023 10:40 PM, Philippe Mathieu-Daudé wrote:
> > On 18/8/23 11:49, Xiaoyao Li wrote:
> > > Introduce kvm_arch_pre_create_vcpu(), to perform arch-dependent
> > > work prior to create any vcpu. This is for i386 TDX because it needs
> > > call TDX_INIT_VM before creating any vcpu.
> > > 
> > > Signed-off-by: Xiaoyao Li 
> > > Acked-by: Gerd Hoffmann 
> > > ---
> > >   accel/kvm/kvm-all.c  | 12 
> > >   include/sysemu/kvm.h |  1 +
> > >   2 files changed, 13 insertions(+)
> > > 
> > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> > > index c9f3aab5e587..5071af917ae0 100644
> > > --- a/accel/kvm/kvm-all.c
> > > +++ b/accel/kvm/kvm-all.c
> > > @@ -422,6 +422,11 @@ static int kvm_get_vcpu(KVMState *s, unsigned
> > > long vcpu_id)
> > >   return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> > >   }
> > > +int __attribute__ ((weak)) kvm_arch_pre_create_vcpu(CPUState *cpu)
> > > +{
> > > +    return 0;
> > > +}
> > 
> > kvm_arch_init_vcpu() is implemented for each arch. Why not use the
> > same approach here?
> 
> Because only x86 needs it currently, for TDX. Other arches don't require an
> implementation.
> 
> If don't provide the _weak_ function, it needs to implement the empty
> function (justing return 0) in all the other arches just as the placeholder.
> If QEMU community prefers this approach, I can change to it in next version.

Alternative is to move the hook to x86 specific function, not common kvm
function. With my quick grepping, x86_cpus_init() or x86_cpu_realizefn().
-- 
Isaku Yamahata 



Re: [PATCH v2 33/58] headers: Add definitions from UEFI spec for volumes, resources, etc...

2023-08-23 Thread Isaku Yamahata
On Fri, Aug 18, 2023 at 05:50:16AM -0400,
Xiaoyao Li  wrote:

> Add UEFI definitions for literals, enums, structs, GUIDs, etc... that
> will be used by TDX to build the UEFI Hand-Off Block (HOB) that is passed
> to the Trusted Domain Virtual Firmware (TDVF).
> 
> All values come from the UEFI specification and TDVF design guide. [1]
> 
> Note, EFI_RESOURCE_MEMORY_UNACCEPTED will be added in future UEFI spec.
> 
> [1] 
> https://software.intel.com/content/dam/develop/external/us/en/documents/tdx-virtual-firmware-design-guide-rev-1.pdf

Nitpick: The specs [1] [2] include unaccepted memory.

[1] UEFI Specification Version 2.10 (released August 2022)
[2] UEFI Platform Initialization Distribution Packaging Specification Version 
1.1)
-- 
Isaku Yamahata 



Re: [PATCH v2 32/58] i386/tdx: Track RAM entries for TDX VM

2023-08-21 Thread Isaku Yamahata
On Fri, Aug 18, 2023 at 05:50:15AM -0400,
Xiaoyao Li  wrote:

> diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h
> index e9d2888162ce..9b3c427766ef 100644
> --- a/target/i386/kvm/tdx.h
> +++ b/target/i386/kvm/tdx.h
> @@ -15,6 +15,17 @@ typedef struct TdxGuestClass {
>  ConfidentialGuestSupportClass parent_class;
>  } TdxGuestClass;
>  
> +enum TdxRamType{
> +TDX_RAM_UNACCEPTED,
> +TDX_RAM_ADDED,
> +};
> +
> +typedef struct TdxRamEntry {
> +uint64_t address;
> +uint64_t length;
> +uint32_t type;

nitpick: enum TdxRamType. and related function arguments.

-- 
Isaku Yamahata 



Re: [PATCH v2 19/58] qom: implement property helper for sha384

2023-08-21 Thread Isaku Yamahata
On Mon, Aug 21, 2023 at 10:25:35AM +0100,
"Daniel P. Berrangé"  wrote:

> On Fri, Aug 18, 2023 at 05:50:02AM -0400, Xiaoyao Li wrote:
> > From: Isaku Yamahata 
> > 
> > Implement property_add_sha384() which converts hex string <-> uint8_t[48]
> > It will be used for TDX which uses sha384 for measurement.
> 
> I think it is likely a better idea to use base64 for the encoding
> the binary hash - we use base64 for all the sev-guest properties
> that were binary data.
> 
> At which points the property set/get logic is much simpler as it
> is just needing a call to  g_base64_encode / g_base64_decode and
> length validation for the decode case.

A hex string is the popular way to show a hash value, isn't it?  Anyway, it's
easy for a human operator, shell scripts, libvirt, or whatever to convert
between those representations with utility commands like base64 or xxd, or
with a library call.  Either way would work.
-- 
Isaku Yamahata 



Re: [PATCH v2 08/58] i386/tdx: Adjust the supported CPUID based on TDX restrictions

2023-08-21 Thread Isaku Yamahata
On Fri, Aug 18, 2023 at 05:49:51AM -0400,
Xiaoyao Li  wrote:

> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 56cb826f6125..3198bc9fd5fb 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
...
> +static inline uint32_t host_cpuid_reg(uint32_t function,
> +  uint32_t index, int reg)
> +{
> +uint32_t eax, ebx, ecx, edx;
> +uint32_t ret = 0;
> +
> +host_cpuid(function, index, , , , );
> +
> +switch (reg) {
> +case R_EAX:
> +ret |= eax;
> +break;
> +case R_EBX:
> +ret |= ebx;
> +break;
> +case R_ECX:
> +ret |= ecx;
> +break;
> +case R_EDX:
> +ret |= edx;

Nitpick: "|" isn't needed as we initialize ret = 0 above. Just '='.
-- 
Isaku Yamahata 



Re: [PATCH v2 45/58] i386/tdx: Limit the range size for MapGPA

2023-08-21 Thread Isaku Yamahata
On Fri, Aug 18, 2023 at 05:50:28AM -0400,
Xiaoyao Li  wrote:

> From: Isaku Yamahata 
> 
> If the range for TDG.VP.VMCALL is too large, process the limited
> size and return retry error.  It's bad for VMM to take too long time,
> e.g. second order, with blocking vcpu execution.  It results in too many
> missing timer interrupts.

This patch requires the guest-side patch [1].
Unless the guest has a large amount of memory, though, it's unlikely to hit
the limit with KVM/QEMU.

[1] https://lore.kernel.org/all/20230811021246.821-1-de...@microsoft.com/

> 
> Signed-off-by: Isaku Yamahata 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/tdx.c | 19 ++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 0c43c1f7759f..ced55be506d1 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -994,12 +994,16 @@ static hwaddr tdx_shared_bit(X86CPU *cpu)
>  return (cpu->phys_bits > 48) ? BIT_ULL(51) : BIT_ULL(47);
>  }
>  
> +/* 64MB at most in one call. What value is appropriate? */
> +#define TDX_MAP_GPA_MAX_LEN (64 * 1024 * 1024)
> +
>  static void tdx_handle_map_gpa(X86CPU *cpu, struct kvm_tdx_vmcall *vmcall)
>  {
>  hwaddr shared_bit = tdx_shared_bit(cpu);
>  hwaddr gpa = vmcall->in_r12 & ~shared_bit;
>  bool private = !(vmcall->in_r12 & shared_bit);
>  hwaddr size = vmcall->in_r13;
> +bool retry = false;
>  int ret = 0;
>  
>  vmcall->status_code = TDG_VP_VMCALL_INVALID_OPERAND;
> @@ -1018,12 +1022,25 @@ static void tdx_handle_map_gpa(X86CPU *cpu, struct 
> kvm_tdx_vmcall *vmcall)
>  return;
>  }
>  
> +if (size > TDX_MAP_GPA_MAX_LEN) {
> +retry = true;
> +size = TDX_MAP_GPA_MAX_LEN;
> +}
> +
>  if (size > 0) {
>  ret = kvm_convert_memory(gpa, size, private);
>  }
>  
>  if (!ret) {
> -vmcall->status_code = TDG_VP_VMCALL_SUCCESS;
> +if (retry) {
> +vmcall->status_code = TDG_VP_VMCALL_RETRY;
> +vmcall->out_r11 = gpa + size;
> +if (!private) {
> +    vmcall->out_r11 |= shared_bit;
> +}
> +} else {
> +vmcall->status_code = TDG_VP_VMCALL_SUCCESS;
> +}
>  }
>  }
>  
> -- 
> 2.34.1
> 
> 

-- 
Isaku Yamahata 



Re: [PATCH v3 2/2] target/i386: Avoid overflow of the cache parameter enumerated by leaf 4

2023-08-17 Thread Isaku Yamahata
On Wed, Aug 16, 2023 at 04:06:58PM +0800,
Qian Wen  wrote:

> According to SDM, CPUID.0x4:EAX[31:26] indicates the Maximum number of
> addressable IDs for processor cores in the physical package. If we
> launch over 64 cores VM, the 6-bit field will overflow, and the wrong
> core_id number will be reported.
> 
> Since the HW reports 0x3f when the intel processor has over 64 cores,
> limit the max value written to EBX[31:26] to 63, so max num_cores should
> be 64.
> 
> Signed-off-by: Qian Wen 
> Reviewed-by: Zhao Liu 
> ---
>  target/i386/cpu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 5c008b9d7e..3b6854300a 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -248,7 +248,7 @@ static void encode_cache_cpuid4(CPUCacheInfo *cache,
>  *eax = CACHE_TYPE(cache->type) |
> CACHE_LEVEL(cache->level) |
> (cache->self_init ? CACHE_SELF_INIT_LEVEL : 0) |
> -   ((num_cores - 1) << 26) |
> +   ((MIN(num_cores, 64) - 1) << 26) |
> ((num_apic_ids - 1) << 14);
>  
>  assert(cache->line_size > 0);
> -- 
> 2.25.1
> 
> 

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [PATCH v3 0/2] Fix overflow of the max number of IDs for logic processor and core

2023-08-17 Thread Isaku Yamahata
On Wed, Aug 16, 2023 at 04:06:56PM +0800,
Qian Wen  wrote:

> CPUID.1.EBX[23:16]: Maximum number of addressable IDs for logical
> processors in this physical package.
> CPUID.4:EAX[31:26]: Maximum number of addressable IDs for processor cores
> in the physical package.
> 
> The current qemu code doesn't limit the value written to these two fields.
> If the guest has a huge number of cores, APs (application processor) will
> fail to bring up and the wrong info will be reported.
> According to HW behavior, setting max value written to CPUID.1.EBX[23:16]
> to 255, and CPUID.4:EAX[31:26] to 63.
> 
> ---
> Changes v2 -> v3:
>   - Add patch 2.
>   - Revise the commit message and comment to be clearer.
>   - Using MIN() for limitation.
> Changes v1 -> v2:
>   - Revise the commit message and comment to more clearer.
>   - Rebased to v8.1.0-rc2.
> 
> Qian Wen (2):
>   target/i386: Avoid cpu number overflow in legacy topology
>   target/i386: Avoid overflow of the cache parameter enumerated by leaf 4
> 
>  target/i386/cpu.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> base-commit: 0d52116fd82cdd1f4a88837336af5b6290c364a4
> -- 
> 2.25.1
> 

The patch itself looks good. Can we add test cases?
We have some in qemu/tests/unit/test-x86-cpuid.c.
-- 
Isaku Yamahata 



Re: [PATCH v3 1/2] target/i386: Avoid cpu number overflow in legacy topology

2023-08-17 Thread Isaku Yamahata
On Wed, Aug 16, 2023 at 04:06:57PM +0800,
Qian Wen  wrote:

> The legacy topology enumerated by CPUID.1.EBX[23:16] is defined in SDM
> Vol2:
> 
> Bits 23-16: Maximum number of addressable IDs for logical processors in
> this physical package.
> 
> When threads_per_socket > 255, it will 1) overwrite bits[31:24] which is
> apic_id, 2) bits [23:16] get truncated.
> 
> Specifically, if launching the VM with -smp 256, the value written to
> EBX[23:16] is 0 because of data overflow. If the guest only supports
> legacy topology, without V2 Extended Topology enumerated by CPUID.0x1f
> or Extended Topology enumerated by CPUID.0x0b to support over 255 CPUs,
> the return of the kernel invoking cpu_smt_allowed() is false and APs
> (application processors) will fail to bring up. Then only CPU 0 is online,
> and others are offline.
> 
> For example, launch VM via:
> qemu-system-x86_64 -M q35,accel=kvm,kernel-irqchip=split \
> -cpu qemu64,cpuid-0xb=off -smp 256 -m 32G \
> -drive file=guest.img,if=none,id=virtio-disk0,format=raw \
> -device virtio-blk-pci,drive=virtio-disk0,bootindex=1 --nographic
> 
> The guest shows:
> CPU(s):   256
> On-line CPU(s) list:  0
> Off-line CPU(s) list: 1-255
> 
> To avoid this issue caused by overflow, limit the max value written to
> EBX[23:16] to 255 as the HW does.
> 
> Signed-off-by: Qian Wen 
> Reviewed-by: Zhao Liu 
> ---
>  target/i386/cpu.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 97ad229d8b..5c008b9d7e 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -6008,6 +6008,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
> uint32_t count,
>  uint32_t die_offset;
>  uint32_t limit;
>  uint32_t signature[3];
> +uint32_t threads_per_socket;
>  X86CPUTopoInfo topo_info;
>  
>  topo_info.dies_per_pkg = env->nr_dies;
> @@ -6049,8 +6050,9 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
> uint32_t count,
>  *ecx |= CPUID_EXT_OSXSAVE;
>  }
>  *edx = env->features[FEAT_1_EDX];
> -if (cs->nr_cores * cs->nr_threads > 1) {
> -*ebx |= (cs->nr_cores * cs->nr_threads) << 16;
> +threads_per_socket = cs->nr_cores * cs->nr_threads;
> +    if (threads_per_socket > 1) {
> +*ebx |= MIN(threads_per_socket, 255) << 16;
>  *edx |= CPUID_HT;
>  }
>  if (!cpu->enable_pmu) {
> -- 
> 2.25.1
> 
> 

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [PATCH v2] migration: refactor migration_completion

2023-08-14 Thread Isaku Yamahata
On Fri, Aug 04, 2023 at 05:30:53PM +0800,
Wei Wang  wrote:

> Current migration_completion function is a bit long. Refactor the long
> implementation into different subfunctions:
> - migration_completion_precopy: completion code related to precopy
> - migration_completion_postcopy: completion code related to postcopy
> - close_return_path_on_source: rp thread related cleanup on migration
> completion. It is named to match with open_return_path_on_source.
> 
> This improves readability and is easier for future updates (e.g. add new
> subfunctions when completion code related to new features are needed). No
> functional changes intended.
> 
> Signed-off-by: Wei Wang 
> ---
> Changelog:
> - Merge await_return_path_close_on_source into
>   close_return_path_on_source as the later basically just calls the
>   previous;
> - make migration_completion_postcopy "void" as it doesn't return a
>   value.

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [RFC PATCH 00/19] QEMU gmem implemention

2023-08-14 Thread Isaku Yamahata
ood deal on the
> QEMU side. I think it's still worthwhile to still allow:
> 
>  -object memory-backend-memfd-private,...
> 
> because it provides a nice mechanism to set up a pair of shared/private
> memfd's to enable hole-punching via fallocate() to avoid doubling memory
> allocations for shared/private. It's also a nice place to control
> potentially-configurable things like:
> 
>  - whether or not to enable discard/hole-punching
>  - if discard is enabled, whether or not to register the range via
>RamDiscardManager interface so that VFIO/IOMMU mappings get updated
>when doing PCI passthrough. SNP relies on this for PCI passthrough
>when discard is enabled, otherwise DMA occurs to stale mappings of
>discarded bounce-buffer pages:
> 
>  
> https://github.com/AMDESE/qemu/blob/snp-latest/backends/hostmem-memfd-private.c#L449
> 
> But for other memory ranges, it doesn't do a lot of good to rely on
> users to control those via -object memory-backend-memfd-private, since
> QEMU will set up some regions internally, like the UEFI ROM.
> 
> It also isn't ideal for QEMU itself to internally control what
> should/shouldn't be set up with a backing guest_memfd, because some
> guest kernels do weird stuff, like scan for ROM regions in areas that
> guest kernels might have mapped as encrypted in guest page table. You
> can consider them to be guest bugs, but even current SNP-capable
> kernels exhibit this behavior and if the guest wants to do dumb stuff
> QEMU should let it.
> 
> But for these latter 2 cases, it doesn't make sense to attempt to do
> any sort of discarding of backing pages since it doesn't make sense to
> discard ROM pages.
> 
> So I think it makes sense to just set up the gmemfd automatically across
> the board internally, and keep memory-backend-memfd-private around
> purely as a way to control/configure discardable memory.


I'm looking at the repo and
31a7c7e36684 ("*hostmem-memfd-private: Initial discard manager support")

Do we have to implement RAM_DISCARD_MANAGER in memory-backend-memfd-private?
Can't we implement it in host_mem? The interface callbacks can have a check
"if (!private) return".  Then we can support any host-mem backend.
-- 
Isaku Yamahata 



Re: [RFC PATCH 06/19] i386/pc: Drop pc_machine_kvm_type()

2023-08-02 Thread Isaku Yamahata
On Mon, Jul 31, 2023 at 12:21:48PM -0400,
Xiaoyao Li  wrote:

> pc_machine_kvm_type() was introduced by commit e21be724eaf5 ("i386/xen:
> add pc_machine_kvm_type to initialize XEN_EMULATE mode") to do Xen
> specific initialization by utilizing kvm_type method.
> 
> commit eeedfe6c6316 ("hw/xen: Simplify emulated Xen platform init")
> moves the Xen specific initialization to pc_basic_device_init().
> 
> There is no need to keep the PC specific kvm_type() implementation
> anymore. On the other hand, later patch will implement kvm_type()
> method for all x86/i386 machines to support KVM_X86_SW_PROTECTED_VM.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  hw/i386/pc.c | 5 -
>  include/hw/i386/pc.h | 3 ---
>  2 files changed, 8 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 3109d5e0e035..abeadd903827 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1794,11 +1794,6 @@ static void pc_machine_initfn(Object *obj)
>  cxl_machine_init(obj, >cxl_devices_state);
>  }
>  
> -int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)
> -{
> -return 0;
> -}
> -
>  static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
>  {
>  CPUState *cs;
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index d54e8b1101e4..c98d628a76f3 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -296,15 +296,12 @@ extern const size_t pc_compat_1_5_len;
>  extern GlobalProperty pc_compat_1_4[];
>  extern const size_t pc_compat_1_4_len;
>  
> -int pc_machine_kvm_type(MachineState *machine, const char *vm_type);
> -
>  #define DEFINE_PC_MACHINE(suffix, namestr, initfn, optsfn) \
>  static void pc_machine_##suffix##_class_init(ObjectClass *oc, void 
> *data) \
>  { \
>  MachineClass *mc = MACHINE_CLASS(oc); \
>  optsfn(mc); \
>  mc->init = initfn; \
> -mc->kvm_type = pc_machine_kvm_type; \
>  } \
>  static const TypeInfo pc_machine_type_##suffix = { \
>  .name   = namestr TYPE_MACHINE_SUFFIX, \
> -- 
> 2.34.1
> 

It seems strange for MachineClass to have kvm_type(). Probably AccelClass.
(struct KVMAccelClass?)

Anyway this is independent clean up.

Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [RFC PATCH 08/19] HostMem: Add private property to indicate to use kvm gmem

2023-08-02 Thread Isaku Yamahata
On Wed, Aug 02, 2023 at 04:14:29PM +0200,
David Hildenbrand  wrote:

> On 02.08.23 10:03, Xiaoyao Li wrote:
> > On 8/2/2023 1:21 AM, David Hildenbrand wrote:
> > > On 31.07.23 18:21, Xiaoyao Li wrote:
> > > > From: Isaku Yamahata 
> > > > 
> > > > Signed-off-by: Isaku Yamahata 
> > > > Signed-off-by: Xiaoyao Li 
> > > > ---
> > > >    backends/hostmem.c   | 18 ++
> > > >    include/sysemu/hostmem.h |  2 +-
> > > >    qapi/qom.json    |  4 
> > > >    3 files changed, 23 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/backends/hostmem.c b/backends/hostmem.c
> > > > index 747e7838c031..dbdbb0aafd45 100644
> > > > --- a/backends/hostmem.c
> > > > +++ b/backends/hostmem.c
> > > > @@ -461,6 +461,20 @@ static void
> > > > host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
> > > >    }
> > > >    backend->reserve = value;
> > > >    }
> > > > +
> > > > +static bool host_memory_backend_get_private(Object *o, Error **errp)
> > > > +{
> > > > +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> > > > +
> > > > +    return backend->private;
> > > > +}
> > > > +
> > > > +static void host_memory_backend_set_private(Object *o, bool value,
> > > > Error **errp)
> > > > +{
> > > > +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> > > > +
> > > > +    backend->private = value;
> > > > +}
> > > >    #endif /* CONFIG_LINUX */
> > > >    static bool
> > > > @@ -541,6 +555,10 @@ host_memory_backend_class_init(ObjectClass *oc,
> > > > void *data)
> > > >    host_memory_backend_get_reserve,
> > > > host_memory_backend_set_reserve);
> > > >    object_class_property_set_description(oc, "reserve",
> > > >    "Reserve swap space (or huge pages) if applicable");
> > > > +    object_class_property_add_bool(oc, "private",
> > > > +    host_memory_backend_get_private,
> > > > host_memory_backend_set_private);
> > > > +    object_class_property_set_description(oc, "private",
> > > > +    "Use KVM gmem private memory");
> > > >    #endif /* CONFIG_LINUX */
> > > >    /*
> > > >     * Do not delete/rename option. This option must be considered
> > > > stable
> > > > diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
> > > > index 39326f1d4f9c..d88970395618 100644
> > > > --- a/include/sysemu/hostmem.h
> > > > +++ b/include/sysemu/hostmem.h
> > > > @@ -65,7 +65,7 @@ struct HostMemoryBackend {
> > > >    /* protected */
> > > >    uint64_t size;
> > > >    bool merge, dump, use_canonical_path;
> > > > -    bool prealloc, is_mapped, share, reserve;
> > > > +    bool prealloc, is_mapped, share, reserve, private;
> > > >    uint32_t prealloc_threads;
> > > >    ThreadContext *prealloc_context;
> > > >    DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
> > > > diff --git a/qapi/qom.json b/qapi/qom.json
> > > > index 7f92ea43e8e1..e0b2044e3d20 100644
> > > > --- a/qapi/qom.json
> > > > +++ b/qapi/qom.json
> > > > @@ -605,6 +605,9 @@
> > > >    # @reserve: if true, reserve swap space (or huge pages) if applicable
> > > >    # (default: true) (since 6.1)
> > > >    #
> > > > +# @private: if true, use KVM gmem private memory
> > > > +#   (default: false) (since 8.1)
> > > > +#
> > > 
> > > But that's not what any of this does.
> > > 
> > > This patch only adds a property and doesn't even explain what it intends
> > > to achieve with that.
> > > 
> > > How will it be used from a user? What will it affect internally? What
> > > will it modify in regards of the memory backend?
> > 
> > How it will be used is in the next patch, patch 09.
> > 
> > for kvm_x86_sw_protected_vm type VM, it will allocate private gmem with
> > KVM ioctl if the memory backend has property "private" on.
> 
> It feels wired up the wrong way.
> 
> When creating/initializing the memory backend, we should also take care of
> allocating the gmem_fd, for example, by doing some gmem allocation callback,
> ideally *internally* creating the RAM memory region / RAMBlock.
> 
> And we should fail if that is impossible (gmem does not apply to the VM) or
> creating the gmem_fd failed for other reason.
> 
> Like passing a RAM_GMEM flag to memory_region_init_ram_flags_nomigrate() in
> ram_backend_memory_alloc(), to then handle it internally, failing if there
> is an error.

KVM gmem is tied to a VM; it cannot be created before the VM.  We have to
delay the allocation of KVM gmem until VM initialization.

Hmm, one options is to move gmem_fd from RAMBlock to KVMSlot.  Handle the
allocation of KVM gmem (issuing KVM gmem ioctl) there. i.e. in
kvm_set_phys_mem() or kvm_region_add() (or whatever functions of KVM memory
listener).  Maybe we can drop ram_block_convert_range() and can have KVM
specific logic instead.

We still need a way for user to specify which guest memory region is subject
to KVM gmem, though.
-- 
Isaku Yamahata 



Re: [RFC PATCH 19/19] i386: Disable SMM mode for X86_SW_PROTECTED_VM

2023-08-02 Thread Isaku Yamahata
On Mon, Jul 31, 2023 at 12:22:01PM -0400,
Xiaoyao Li  wrote:

> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/kvm.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index a96640512dbc..62f237068a3a 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -2654,6 +2654,13 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  
>  if (x86ms->vm_type == KVM_X86_SW_PROTECTED_VM) {
>  memory_listener_register(_x86_sw_protected_vm_memory_listener, 
> _space_memory);
> +
> +if (x86ms->smm == ON_OFF_AUTO_AUTO) {
> +x86ms->smm = ON_OFF_AUTO_OFF;
> +} else if (x86ms->smm == ON_OFF_AUTO_ON) {
> +error_report("X86_SW_PROTECTED_VM doesn't support SMM");
> +return -EINVAL;
> +}
>      }
>  

If we use confidential guest support, this check should go to there.
-- 
Isaku Yamahata 



Re: [RFC PATCH 15/19] kvm: handle KVM_EXIT_MEMORY_FAULT

2023-08-02 Thread Isaku Yamahata
On Mon, Jul 31, 2023 at 12:21:57PM -0400,
Xiaoyao Li  wrote:

> From: Chao Peng 
> 
> Currently only KVM_MEMORY_EXIT_FLAG_PRIVATE in flags is valid when
> KVM_EXIT_MEMORY_FAULT happens. It indicates userspace needs to do
> the memory conversion on the RAMBlock to turn the memory into desired
> attribute, i.e., private/shared.
> 
> Note, KVM_EXIT_MEMORY_FAULT makes sense only when the RAMBlock has
> gmem memory backend.
> 
> Signed-off-by: Chao Peng 
> Signed-off-by: Xiaoyao Li 
> ---
>  accel/kvm/kvm-all.c | 52 +
>  1 file changed, 52 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f9b5050b8885..72d50b923bf2 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -3040,6 +3040,48 @@ static void kvm_eat_signals(CPUState *cpu)
>  } while (sigismember(, SIG_IPI));
>  }
>  
> +static int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> +{
> +MemoryRegionSection section;
> +void *addr;
> +RAMBlock *rb;
> +ram_addr_t offset;
> +int ret = -1;
> +
> +section = memory_region_find(get_system_memory(), start, size);
> +if (!section.mr) {
> +return ret;
> +}
> +
> +if (memory_region_can_be_private(section.mr)) {
> +if (to_private) {
> +ret = kvm_set_memory_attributes_private(start, size);
> +} else {
> +ret = kvm_set_memory_attributes_shared(start, size);
> +}
> +
> +if (ret) {
> +return ret;
> +}
> +
> +addr = memory_region_get_ram_ptr(section.mr) +
> +   section.offset_within_region;
> +rb = qemu_ram_block_from_host(addr, false, );

Here we already have the section, and thus section.mr->ram_block.  We don't
have to scan the existing RAMBlocks.

Except that, looks good to me.
Reviewed-by: Isaku Yamahata 
-- 
Isaku Yamahata 



Re: [RFC PATCH 00/19] QEMU gmem implemention

2023-07-31 Thread Isaku Yamahata
On Mon, Jul 31, 2023 at 12:21:42PM -0400,
Xiaoyao Li  wrote:

> This is the first RFC version of enabling KVM gmem[1] as the backend for
> private memory of KVM_X86_PROTECTED_VM.
> 
> It adds the support to create a specific KVM_X86_PROTECTED_VM type VM,
> and introduces 'private' property for memory backend. When the vm type
> is KVM_X86_PROTECTED_VM and memory backend has private enabled as below,
> it will call KVM gmem ioctl to allocate private memory for the backend.
> 
> $qemu -object memory-backend-ram,id=mem0,size=1G,private=on \
>   -machine q35,kvm-type=sw-protected-vm,memory-backend=mem0 \
> ...
> 
> Unfortunately this patch series fails the boot of OVMF at very early
> stage due to triple fault because KVM doesn't support emulate string IO
> to private memory. We leave it as an open to be discussed.
> 
> There are following design opens that need to be discussed:
> 
> 1. how to determine the vm type?
> 
>a. like this series, specify the vm type via machine property
>   'kvm-type'
>b. check the memory backend, if any backend has 'private' property
>   set, the vm-type is set to KVM_X86_PROTECTED_VM.

Hi Xiaoyao.  Because QEMU already has confidential guest support, we should
utilize it.  Say,
qemu  \
  -object sw-protected, id=swp0,  \
  -machine confidential-guest-support=swp0



> 2. whether 'private' property is needed if we choose 1.b as design 
> 
>with 1.b, QEMU can decide whether the memory region needs to be
>private (allocates gmem fd for it) or not, on its own.


The memory region property (how to create a KVM memory slot) should be
independent of the underlying VM type.  Some (e.g. TDX) may require a KVM
private memory slot, some may not.  Leave the decision to the VM-type
backend.  It can use a QEMU memory listener.
-- 
Isaku Yamahata 



Re: [PATCH v1] migration: refactor migration_completion

2023-07-17 Thread Isaku Yamahata
avevm_state_complete_precopy(s->to_dst_file, false,
> + s->block_inactive);
> +out_unlock:
> +qemu_mutex_unlock_iothread();
> +return ret;
> +}
>  
> -qemu_mutex_lock_iothread();
> -qemu_savevm_state_complete_postcopy(s->to_dst_file);
> -qemu_mutex_unlock_iothread();
> +static int migration_completion_postcopy(MigrationState *s)
> +{
> +trace_migration_completion_postcopy_end();
> +
> +qemu_mutex_lock_iothread();
> +qemu_savevm_state_complete_postcopy(s->to_dst_file);
> +qemu_mutex_unlock_iothread();
> +
> +/*
> + * Shutdown the postcopy fast path thread.  This is only needed when dest
> + * QEMU binary is old (7.1/7.2).  QEMU 8.0+ doesn't need this.
> + */
> +if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
> +postcopy_preempt_shutdown_file(s);
> +}
> +
> +trace_migration_completion_postcopy_end_after_complete();
> +
> +return 0;

Always return 0?  Make it void.


> +}
>  
> +static void migration_completion_failed(MigrationState *s,
> +int current_active_state)
> +{
> +if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
> +  s->state == MIGRATION_STATUS_DEVICE)) {
>  /*
> - * Shutdown the postcopy fast path thread.  This is only needed
> - * when dest QEMU binary is old (7.1/7.2).  QEMU 8.0+ doesn't need
> - * this.
> + * If not doing postcopy, vm_start() will be called: let's
> + * regain control on images.
>   */
> -if (migrate_postcopy_preempt() && s->preempt_pre_7_2) {
> -postcopy_preempt_shutdown_file(s);
> +Error *local_err = NULL;
> +
> +qemu_mutex_lock_iothread();
> +bdrv_activate_all(_err);
> +if (local_err) {
> +error_report_err(local_err);
> +} else {
> +s->block_inactive = false;
>  }
> +qemu_mutex_unlock_iothread();
> +}
>  
> -trace_migration_completion_postcopy_end_after_complete();
> -} else {
> +migrate_set_state(>state, current_active_state,
> +  MIGRATION_STATUS_FAILED);
> +}
> +
> +/**
> + * migration_completion: Used by migration_thread when there's not much left.
> + *   The caller 'breaks' the loop when this returns.
> + *
> + * @s: Current migration state
> + */
> +static void migration_completion(MigrationState *s)
> +{
> +int ret = -1;
> +int current_active_state = s->state;
> +
> +if (s->state == MIGRATION_STATUS_ACTIVE) {
> +ret = migration_completion_precopy(s, _active_state);
> +} else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
> +ret = migration_completion_postcopy(s);

Here we can set ret = 0.


> +}
> +
> +if (ret < 0) {
>  goto fail;
>  }
>  
> @@ -2357,14 +2413,8 @@ static void migration_completion(MigrationState *s)
>   * it will wait for the destination to send it's status in
>   * a SHUT command).
>   */
> -if (s->rp_state.rp_thread_created) {
> -int rp_error;
> -trace_migration_return_path_end_before();
> -rp_error = await_return_path_close_on_source(s);
> -trace_migration_return_path_end_after(rp_error);
> -if (rp_error) {
> -goto fail;
> -}
> +if (close_return_path_on_source(s) < 0) {
> +goto fail;
>  }
>  
>  if (qemu_file_get_error(s->to_dst_file)) {
> @@ -2384,26 +2434,7 @@ static void migration_completion(MigrationState *s)
>  return;
>  
>  fail:
> -if (s->block_inactive && (s->state == MIGRATION_STATUS_ACTIVE ||
> -  s->state == MIGRATION_STATUS_DEVICE)) {
> -/*
> - * If not doing postcopy, vm_start() will be called: let's
> - * regain control on images.
> - */
> -Error *local_err = NULL;
> -
> -qemu_mutex_lock_iothread();
> -bdrv_activate_all(_err);
> -if (local_err) {
> -error_report_err(local_err);
> -} else {
> -s->block_inactive = false;
> -}
> -qemu_mutex_unlock_iothread();
> -}
> -
> -migrate_set_state(>state, current_active_state,
> -  MIGRATION_STATUS_FAILED);
> +migration_completion_failed(s, current_active_state);
>  }
>  
>  /**
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



[PATCH 1/3] exec/memory: Add symbolic value for memory listener priority for accel

2023-06-20 Thread Isaku Yamahata
Add MEMORY_LISTENER_PRIORITY_ACCEL as the symbolic value for the memory
listener priority, replacing the hard-coded value 10 used for accelerators.

No functional change intended.

Signed-off-by: Isaku Yamahata 
---
 accel/hvf/hvf-accel-ops.c   | 2 +-
 accel/kvm/kvm-all.c | 2 +-
 hw/arm/xen_arm.c| 2 +-
 hw/i386/xen/xen-hvm.c   | 2 +-
 hw/xen/xen-hvm-common.c | 2 +-
 hw/xen/xen_pt.c | 4 ++--
 include/exec/memory.h   | 2 ++
 target/i386/hax/hax-mem.c   | 2 +-
 target/i386/nvmm/nvmm-all.c | 2 +-
 target/i386/whpx/whpx-all.c | 2 +-
 10 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/accel/hvf/hvf-accel-ops.c b/accel/hvf/hvf-accel-ops.c
index 9c3da03c948f..c0c51841a615 100644
--- a/accel/hvf/hvf-accel-ops.c
+++ b/accel/hvf/hvf-accel-ops.c
@@ -304,7 +304,7 @@ static void hvf_region_del(MemoryListener *listener,
 
 static MemoryListener hvf_memory_listener = {
 .name = "hvf",
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 .region_add = hvf_region_add,
 .region_del = hvf_region_del,
 .log_start = hvf_log_start,
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 7679f397aec0..36ed4ca246b5 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1775,7 +1775,7 @@ void kvm_memory_listener_register(KVMState *s, 
KVMMemoryListener *kml,
 kml->listener.commit = kvm_region_commit;
 kml->listener.log_start = kvm_log_start;
 kml->listener.log_stop = kvm_log_stop;
-kml->listener.priority = 10;
+kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
 kml->listener.name = name;
 
 if (s->kvm_dirty_ring_size) {
diff --git a/hw/arm/xen_arm.c b/hw/arm/xen_arm.c
index 19b1cb81ade9..044093fec75d 100644
--- a/hw/arm/xen_arm.c
+++ b/hw/arm/xen_arm.c
@@ -45,7 +45,7 @@ static MemoryListener xen_memory_listener = {
 .log_sync = NULL,
 .log_global_start = NULL,
 .log_global_stop = NULL,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 struct XenArmState {
diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index 5dc5e805351c..3da5a2b23f7d 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -467,7 +467,7 @@ static MemoryListener xen_memory_listener = {
 .log_sync = xen_log_sync,
 .log_global_start = xen_log_global_start,
 .log_global_stop = xen_log_global_stop,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 static void regs_to_cpu(vmware_regs_t *vmport_regs, ioreq_t *req)
diff --git a/hw/xen/xen-hvm-common.c b/hw/xen/xen-hvm-common.c
index 42339c96bdba..886c3ee944d3 100644
--- a/hw/xen/xen-hvm-common.c
+++ b/hw/xen/xen-hvm-common.c
@@ -155,7 +155,7 @@ MemoryListener xen_io_listener = {
 .name = "xen-io",
 .region_add = xen_io_add,
 .region_del = xen_io_del,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 DeviceListener xen_device_listener = {
diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
index a5401496399b..36e6f93c372f 100644
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -691,14 +691,14 @@ static const MemoryListener xen_pt_memory_listener = {
 .name = "xen-pt-mem",
 .region_add = xen_pt_region_add,
 .region_del = xen_pt_region_del,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 static const MemoryListener xen_pt_io_listener = {
 .name = "xen-pt-io",
 .region_add = xen_pt_io_region_add,
 .region_del = xen_pt_io_region_del,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 /* destroy. */
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 47c2e0221c35..6d95d5917544 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -811,6 +811,8 @@ struct IOMMUMemoryRegion {
 #define IOMMU_NOTIFIER_FOREACH(n, mr) \
 QLIST_FOREACH((n), &(mr)->iommu_notify, node)
 
+#define MEMORY_LISTENER_PRIORITY_ACCEL  10
+
 /**
  * struct MemoryListener: callbacks structure for updates to the physical 
memory map
  *
diff --git a/target/i386/hax/hax-mem.c b/target/i386/hax/hax-mem.c
index 05dbe8cce3ae..bb5ffbc9ac4f 100644
--- a/target/i386/hax/hax-mem.c
+++ b/target/i386/hax/hax-mem.c
@@ -291,7 +291,7 @@ static MemoryListener hax_memory_listener = {
 .region_add = hax_region_add,
 .region_del = hax_region_del,
 .log_sync = hax_log_sync,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
 };
 
 static void hax_ram_block_added(RAMBlockNotifier *n, void *host, size_t size,
diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c
index b75738ee9cdf..19d2f7ef09a6 100644
--- a/target/i386/nvmm/nvmm-all.c
+++ b/target/i386/nvmm/nvmm-all.c
@@ -1138,7 +1138,7 @@ static MemoryListener nvmm_memory_listener = {
 .region_add = nvmm_region_add,
 .region_del = nvmm_region_del,
 .log_sync = nvmm_log_sync,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_ACCEL,

[PATCH 2/3] exec/memory: Add symbol for memory listener priority for dev backend

2023-06-20 Thread Isaku Yamahata
Add MEMORY_LISTENER_PRIORITY_DEV_BAKCNED as the symbolic value for the memory
listener priority, replacing the hard-coded value 10 for the device backend.

No functional change intended.

Signed-off-by: Isaku Yamahata 
---
 accel/kvm/kvm-all.c   | 2 +-
 hw/remote/proxy-memory-listener.c | 2 +-
 hw/virtio/vhost.c | 2 +-
 include/exec/memory.h | 1 +
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 36ed4ca246b5..ae6ecf8326d1 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1800,7 +1800,7 @@ static MemoryListener kvm_io_listener = {
 .name = "kvm-io",
 .eventfd_add = kvm_io_ioeventfd_add,
 .eventfd_del = kvm_io_ioeventfd_del,
-.priority = 10,
+.priority = MEMORY_LISTENER_PRIORITY_DEV_BAKCNED,
 };
 
 int kvm_set_irq(KVMState *s, int irq, int level)
diff --git a/hw/remote/proxy-memory-listener.c 
b/hw/remote/proxy-memory-listener.c
index 18d96a1d04dc..a7f53a0ba464 100644
--- a/hw/remote/proxy-memory-listener.c
+++ b/hw/remote/proxy-memory-listener.c
@@ -217,7 +217,7 @@ void proxy_memory_listener_configure(ProxyMemoryListener 
*proxy_listener,
 proxy_listener->listener.commit = proxy_memory_listener_commit;
 proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
 proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
-proxy_listener->listener.priority = 10;
+proxy_listener->listener.priority = MEMORY_LISTENER_PRIORITY_DEV_BAKCNED;
 proxy_listener->listener.name = "proxy";
 
 memory_listener_register(_listener->listener,
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 23da579ce290..75f7418369cb 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1445,7 +1445,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 .log_sync = vhost_log_sync,
 .log_global_start = vhost_log_global_start,
 .log_global_stop = vhost_log_global_stop,
-.priority = 10
+.priority = MEMORY_LISTENER_PRIORITY_DEV_BAKCNED
 };
 
 hdev->iommu_listener = (MemoryListener) {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 6d95d5917544..5c9e04bf1208 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -812,6 +812,7 @@ struct IOMMUMemoryRegion {
 QLIST_FOREACH((n), &(mr)->iommu_notify, node)
 
 #define MEMORY_LISTENER_PRIORITY_ACCEL  10
+#define MEMORY_LISTENER_PRIORITY_DEV_BAKCNED10
 
 /**
  * struct MemoryListener: callbacks structure for updates to the physical 
memory map
-- 
2.25.1




[PATCH 3/3] exec/memory: Add symbol for the min value of memory listener priority

2023-06-20 Thread Isaku Yamahata
Add MEMORY_LISTENER_PRIORITY_MIN as the symbolic value for the minimum memory
listener priority, instead of the hard-coded magic value 0.  Add explicit
initialization.

No functional change intended.

Signed-off-by: Isaku Yamahata 
---
 accel/kvm/kvm-all.c   | 1 +
 include/exec/memory.h | 1 +
 target/arm/kvm.c  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ae6ecf8326d1..026859a59cd7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1103,6 +1103,7 @@ static MemoryListener kvm_coalesced_pio_listener = {
 .name = "kvm-coalesced-pio",
 .coalesced_io_add = kvm_coalesce_pio_add,
 .coalesced_io_del = kvm_coalesce_pio_del,
+.priority = MEMORY_LISTENER_PRIORITY_MIN,
 };
 
 int kvm_check_extension(KVMState *s, unsigned int extension)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 5c9e04bf1208..dc6daa8364e5 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -811,6 +811,7 @@ struct IOMMUMemoryRegion {
 #define IOMMU_NOTIFIER_FOREACH(n, mr) \
 QLIST_FOREACH((n), &(mr)->iommu_notify, node)
 
+#define MEMORY_LISTENER_PRIORITY_MIN0
 #define MEMORY_LISTENER_PRIORITY_ACCEL  10
 #define MEMORY_LISTENER_PRIORITY_DEV_BAKCNED10
 
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 84da49332c4b..14fbf786897d 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -341,6 +341,7 @@ static MemoryListener devlistener = {
 .name = "kvm-arm",
 .region_add = kvm_arm_devlistener_add,
 .region_del = kvm_arm_devlistener_del,
+.priority =MEMORY_LISTENER_PRIORITY_MIN,
 };
 
 static void kvm_arm_set_device_addr(KVMDevice *kd)
-- 
2.25.1




[PATCH 0/3] Add symbols for memory listener priority

2023-06-20 Thread Isaku Yamahata
The hard-coded value, 10, is used for memory_listener_register().  Add symbolic
values for the priority of struct MemoryListener.  Replace those hard-coded
values with symbols.

The background is KVM guest memory[1] or TDX support.  I'd like to add one more
memory listener whose priority is higher than the KVM memory listener.  And I
don't want to hard-code 10 + 1.

[1] KVM gmem patches
https://github.com/sean-jc/linux/tree/x86/kvm_gmem_solo

Isaku Yamahata (3):
  exec/memory: Add symbolic value for memory listener priority for accel
  exec/memory: Add symbol for memory listener priority for dev backend
  exec/memory: Add symbol for the min value of memory listener priority

 accel/hvf/hvf-accel-ops.c | 2 +-
 accel/kvm/kvm-all.c   | 5 +++--
 hw/arm/xen_arm.c  | 2 +-
 hw/i386/xen/xen-hvm.c | 2 +-
 hw/remote/proxy-memory-listener.c | 2 +-
 hw/virtio/vhost.c | 2 +-
 hw/xen/xen-hvm-common.c   | 2 +-
 hw/xen/xen_pt.c   | 4 ++--
 include/exec/memory.h | 4 
 target/arm/kvm.c  | 1 +
 target/i386/hax/hax-mem.c | 2 +-
 target/i386/nvmm/nvmm-all.c   | 2 +-
 target/i386/whpx/whpx-all.c   | 2 +-
 13 files changed, 19 insertions(+), 13 deletions(-)


base-commit: cab35c73be9d579db105ef73fa8a60728a890098
-- 
2.25.1




Re: [RFC PATCH 00/19] hugetlb support for KVM guest_mem

2023-06-07 Thread Isaku Yamahata
 and expose hugetlbfs_zero_partial_page
>   mm: hugetlb: Expose remove_inode_hugepages
>   mm: hugetlb: Decouple hstate, subpool from inode
>   mm: hugetlb: Allow alloc_hugetlb_folio() to be parametrized by subpool
> and hstate
>   mm: hugetlb: Provide hugetlb_filemap_add_folio()
>   mm: hugetlb: Refactor vma_*_reservation functions
>   mm: hugetlb: Refactor restore_reserve_on_error
>   mm: hugetlb: Use restore_reserve_on_error directly in filesystems
>   mm: hugetlb: Parametrize alloc_hugetlb_folio_from_subpool() by
> resv_map
>   mm: hugetlb: Parametrize hugetlb functions by resv_map
>   mm: truncate: Expose preparation steps for truncate_inode_pages_final
>   KVM: guest_mem: Refactor kvm_gmem fd creation to be in layers
>   KVM: guest_mem: Refactor cleanup to separate inode and file cleanup
>   KVM: guest_mem: hugetlb: initialization and cleanup
>   KVM: guest_mem: hugetlb: allocate and truncate from hugetlb
>   KVM: selftests: Add basic selftests for hugetlbfs-backed guest_mem
>   KVM: selftests: Support various types of backing sources for private
> memory
>   KVM: selftests: Update test for various private memory backing source
> types
> 
>  fs/hugetlbfs/inode.c  | 102 ++--
>  include/linux/hugetlb.h   |  86 ++-
>  include/linux/mm.h|   1 +
>  include/uapi/linux/kvm.h  |  25 +
>  mm/hugetlb.c  | 324 +++-
>  mm/truncate.c |  24 +-
>  .../testing/selftests/kvm/guest_memfd_test.c  |  33 +-
>  .../testing/selftests/kvm/include/test_util.h |  14 +
>  tools/testing/selftests/kvm/lib/test_util.c   |  74 +++
>  .../kvm/x86_64/private_mem_conversions_test.c |  38 +-
>  virt/kvm/guest_mem.c  | 488 ++
>  11 files changed, 882 insertions(+), 327 deletions(-)
> 
> --
> 2.41.0.rc0.172.g3f132b7071-goog

-- 
Isaku Yamahata 



Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

2023-03-22 Thread Isaku Yamahata
On Wed, Mar 08, 2023 at 03:40:26PM +0800,
Chao Peng  wrote:

> On Wed, Mar 08, 2023 at 12:13:24AM +, Ackerley Tng wrote:
> > Chao Peng  writes:
> > 
> > > On Sat, Jan 14, 2023 at 12:01:01AM +, Sean Christopherson wrote:
> > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > ...
> > > > Strongly prefer to use similar logic to existing code that detects 
> > > > wraps:
> > 
> > > > mem->restricted_offset + mem->memory_size < 
> > > > mem->restricted_offset
> > 
> > > > This is also where I'd like to add the "gfn is aligned to offset"
> > > > check, though
> > > > my brain is too fried to figure that out right now.
> > 
> > > Used count_trailing_zeros() for this TODO, unsure we have other better
> > > approach.
> > 
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index afc8c26fa652..fd34c5f7cd2f 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -56,6 +56,7 @@
> > >   #include 
> > >   #include 
> > >   #include 
> > > +#include 
> > 
> > >   #include "coalesced_mmio.h"
> > >   #include "async_pf.h"
> > > @@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct
> > > kvm_memslots *slots, int id,
> > >   return false;
> > >   }
> > 
> > > +/*
> > > + * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
> > > + */
> > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > +{
> > > + if (!offset)
> > > + return true;
> > > + if (!gpa)
> > > + return false;
> > > +
> > > + return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));

This check doesn't work as expected. For example, with offset = 2GB and
gpa = 4GB, this check fails.
I came up with the following.

>From ec87e25082f0497431b732702fae82c6a05071bf Mon Sep 17 00:00:00 2001
Message-Id: 

From: Isaku Yamahata 
Date: Wed, 22 Mar 2023 15:32:56 -0700
Subject: [PATCH] KVM: Relax alignment check for restricted mem

kvm_check_rmem_offset_alignment() only checks based on offset alignment
and GPA alignment.  However, the actual alignment for offset depends
on architecture.  For x86 case, it can be 1G, 2M or 4K.  So even if
GPA is aligned for 1G+, only 1G-alignment is required for offset.

Without this patch, gpa=4G, offset=2G results in failure of memory slot
creation.

Fixes: edc8814b2c77 ("KVM: Require gfn be aligned with restricted offset")
Signed-off-by: Isaku Yamahata 
---
 arch/x86/include/asm/kvm_host.h | 15 +++
 virt/kvm/kvm_main.c |  9 -
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 88e11dd3afde..03af44650f24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -143,6 +144,20 @@
 #define KVM_HPAGE_MASK(x)  (~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
+#define kvm_arch_required_alignmentkvm_arch_required_alignment
+static inline int kvm_arch_required_alignment(u64 gpa)
+{
+   int zeros = count_trailing_zeros(gpa);
+
+   WARN_ON_ONCE(!PAGE_ALIGNED(gpa));
+   if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_1G))
+   return KVM_HPAGE_SHIFT(PG_LEVEL_1G);
+   else if (zeros >= KVM_HPAGE_SHIFT(PG_LEVEL_2M))
+   return KVM_HPAGE_SHIFT(PG_LEVEL_2M);
+
+   return PAGE_SHIFT;
+}
+
 #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
 #define KVM_MIN_ALLOC_MMU_PAGES 64UL
 #define KVM_MMU_HASH_SHIFT 12
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c9c4eef457b0..f4ff96171d24 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2113,6 +2113,13 @@ static bool kvm_check_memslot_overlap(struct 
kvm_memslots *slots, int id,
return false;
 }
 
+#ifndef kvm_arch_required_alignment
+__weak int kvm_arch_required_alignment(u64 gpa)
+{
+   return PAGE_SHIFT
+}
+#endif
+
 /*
  * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
  */
@@ -2123,7 +2130,7 @@ static bool kvm_check_rmem_offset_alignment(u64 offset, 
u64 gpa)
if (!gpa)
return false;
 
-   return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
+   return !!(count_trailing_zeros(offset) >= 
kvm_arch_required_alignment(gpa));
 }
 
 /*
-- 
2.25.1



-- 
Isaku Yamahata 



Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes

2023-02-13 Thread Isaku Yamahata
On Fri, Feb 10, 2023 at 12:35:30AM +,
Sean Christopherson  wrote:

> On Wed, Feb 08, 2023, Isaku Yamahata wrote:
> > On Fri, Dec 02, 2022 at 02:13:40PM +0800,
> > Chao Peng  wrote:
> > 
> > > +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > +struct kvm_memory_attributes *attrs)
> > > +{
> > > + gfn_t start, end;
> > > + unsigned long i;
> > > + void *entry;
> > > + u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> > > +
> > > + /* flags is currently not used. */
> > > + if (attrs->flags)
> > > + return -EINVAL;
> > > + if (attrs->attributes & ~supported_attrs)
> > > + return -EINVAL;
> > > + if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
> > > + return -EINVAL;
> > > + if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
> > > + return -EINVAL;
> > > +
> > > + start = attrs->address >> PAGE_SHIFT;
> > > + end = (attrs->address + attrs->size - 1 + PAGE_SIZE) >> PAGE_SHIFT;
> > > +
> > > + entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> > > +
> > > + mutex_lock(>lock);
> > > + for (i = start; i < end; i++)
> > > + if (xa_err(xa_store(>mem_attr_array, i, entry,
> > > + GFP_KERNEL_ACCOUNT)))
> > > + break;
> > > + mutex_unlock(>lock);
> > > +
> > > + attrs->address = i << PAGE_SHIFT;
> > > + attrs->size = (end - i) << PAGE_SHIFT;
> > > +
> > > + return 0;
> > > +}
> > > +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */
> > > +
> > 
> > If memslot isn't private, it should return error if private attribute is 
> > set.
> 
> Why?  I'd rather keep the two things separate.  If we enforce this sort of 
> thing
> at KVM_SET_MEMORY_ATTRIBUTES, then we also have to enforce it at
> KVM_SET_USER_MEMORY_REGION.

For device assignment via shared GPA, non-private memory slot needs to be
allowed.
-- 
Isaku Yamahata 



Re: [PATCH v10 2/9] KVM: Introduce per-page memory attributes

2023-02-08 Thread Isaku Yamahata
On Fri, Dec 02, 2022 at 02:13:40PM +0800,
Chao Peng  wrote:

> +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> +struct kvm_memory_attributes *attrs)
> +{
> + gfn_t start, end;
> + unsigned long i;
> + void *entry;
> + u64 supported_attrs = kvm_supported_mem_attributes(kvm);
> +
> + /* flags is currently not used. */
> + if (attrs->flags)
> + return -EINVAL;
> + if (attrs->attributes & ~supported_attrs)
> + return -EINVAL;
> + if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
> + return -EINVAL;
> + if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
> + return -EINVAL;
> +
> + start = attrs->address >> PAGE_SHIFT;
> + end = (attrs->address + attrs->size - 1 + PAGE_SIZE) >> PAGE_SHIFT;
> +
> + entry = attrs->attributes ? xa_mk_value(attrs->attributes) : NULL;
> +
> + mutex_lock(>lock);
> + for (i = start; i < end; i++)
> + if (xa_err(xa_store(>mem_attr_array, i, entry,
> + GFP_KERNEL_ACCOUNT)))
> + break;
> + mutex_unlock(>lock);
> +
> + attrs->address = i << PAGE_SHIFT;
> + attrs->size = (end - i) << PAGE_SHIFT;
> +
> + return 0;
> +}
> +#endif /* CONFIG_HAVE_KVM_MEMORY_ATTRIBUTES */
> +

If the memslot isn't private, it should return an error if the private
attribute is set.  Something like the following check is needed.

+   if (attrs->flags & KVM_MEM_PRIVATE) {
+   /* non-private memory slot doesn't allow KVM_MEM_PRIVATE */
+   for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+   struct kvm_memslot_iter iter;
+   struct kvm_memslots *slots;
+
+   slots = __kvm_memslots(kvm, i);
+   kvm_for_each_memslot_in_gfn_range(, slots, start, 
end) {
+   if (!kvm_slot_can_be_private(iter.slot)) {
+   mutex_unlock(>slots_lock);
+   return -EINVAL;
+   }
+   }
+   }
+   }
+


-- 
Isaku Yamahata 



Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM

2023-02-08 Thread Isaku Yamahata
On Tue, Jan 24, 2023 at 01:27:50AM +,
Sean Christopherson  wrote:

> On Thu, Jan 19, 2023, Isaku Yamahata wrote:
> > On Thu, Jan 19, 2023 at 03:25:08PM +,
> > Sean Christopherson  wrote:
> > 
> > > On Thu, Jan 19, 2023, Isaku Yamahata wrote:
> > > > On Sat, Jan 14, 2023 at 12:37:59AM +,
> > > > Sean Christopherson  wrote:
> > > > 
> > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > > > This patch series implements KVM guest private memory for 
> > > > > > confidential
> > > > > > computing scenarios like Intel TDX[1]. If a TDX host accesses
> > > > > > TDX-protected guest memory, machine check can happen which can 
> > > > > > further
> > > > > > crash the running host system, this is terrible for multi-tenant
> > > > > > configurations. The host accesses include those from KVM userspace 
> > > > > > like
> > > > > > QEMU. This series addresses KVM userspace induced crash by 
> > > > > > introducing
> > > > > > new mm and KVM interfaces so KVM userspace can still manage guest 
> > > > > > memory
> > > > > > via a fd-based approach, but it can never access the guest memory
> > > > > > content.
> > > > > > 
> > > > > > The patch series touches both core mm and KVM code. I appreciate
> > > > > > Andrew/Hugh and Paolo/Sean can review and pick these patches. Any 
> > > > > > other
> > > > > > reviews are always welcome.
> > > > > >   - 01: mm change, target for mm tree
> > > > > >   - 02-09: KVM change, target for KVM tree
> > > > > 
> > > > > A version with all of my feedback, plus reworked versions of Vishal's 
> > > > > selftest,
> > > > > is available here:
> > > > > 
> > > > >   g...@github.com:sean-jc/linux.git x86/upm_base_support
> > > > > 
> > > > > It compiles and passes the selftest, but it's otherwise barely 
> > > > > tested.  There are
> > > > > a few todos (2 I think?) and many of the commits need changelogs, 
> > > > > i.e. it's still
> > > > > a WIP.
> > > > > 
> > > > > As for next steps, can you (handwaving all of the TDX folks) take a 
> > > > > look at what
> > > > > I pushed and see if there's anything horrifically broken, and that it 
> > > > > still works
> > > > > for TDX?
> > > > > 
> > > > > Fuad (and pKVM folks) same ask for you with respect to pKVM.  
> > > > > Absolutely no rush
> > > > > (and I mean that).
> > > > > 
> > > > > On my side, the two things on my mind are (a) tests and (b) 
> > > > > downstream dependencies
> > > > > (SEV and TDX).  For tests, I want to build a lists of tests that are 
> > > > > required for
> > > > > merging so that the criteria for merging are clear, and so that if 
> > > > > the list is large
> > > > > (haven't thought much yet), the work of writing and running tests can 
> > > > > be distributed.
> > > > > 
> > > > > Regarding downstream dependencies, before this lands, I want to pull 
> > > > > in all the
> > > > > TDX and SNP series and see how everything fits together.  
> > > > > Specifically, I want to
> > > > > make sure that we don't end up with a uAPI that necessitates ugly 
> > > > > code, and that we
> > > > > don't miss an opportunity to make things simpler.  The patches in the 
> > > > > SNP series to
> > > > > add "legacy" SEV support for UPM in particular made me slightly 
> > > > > rethink some minor
> > > > > details.  Nothing remotely major, but something that needs attention 
> > > > > since it'll
> > > > > be uAPI.
> > > > 
> > > > Although I'm still debuging with TDX KVM, I needed the following.
> > > > kvm_faultin_pfn() is called without mmu_lock held.  the race to change
> > > > private/shared is handled by mmu_seq.  Maybe dedicated function only for
> > > > kvm_faultin_pfn().
> > > 
> > > Gah, you're not on the other thread where this was discussed[*].  Simply 
> > > deleting
> > > the lockdep assertion is safe, for guest types that rely on the 
> > > attributes to
> > > define shared vs. private, KVM rechecks the attributes under the 
> > > protection of
> > > mmu_seq.
> > > 
> > > I'll get a fixed version pushed out today.
> > > 
> > > [*] https://lore.kernel.org/all/y8gpl+lwsusgb...@google.com
> > 
> > Now I have tdx kvm working. I've uploaded at the followings.
> > It's rebased to v6.2-rc3.
> > g...@github.com:yamahata/linux.git tdx/upm
> > g...@github.com:yamahata/qemu.git tdx/upm
> 
> And I finally got a working, building version updated and pushed out (again 
> to):
> 
>   g...@github.com:sean-jc/linux.git x86/upm_base_support
> 

Ok, I rebased TDX part to the updated branch.
g...@github.com:yamahata/linux.git tdx/upm
g...@github.com:yamahata/qemu.git tdx/upm

Now it's v6.2-rc7 based.
qemu needs more patches to avoid registering memory slot for SMM. 
-- 
Isaku Yamahata 



Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM

2023-01-19 Thread Isaku Yamahata
On Thu, Jan 19, 2023 at 03:25:08PM +,
Sean Christopherson  wrote:

> On Thu, Jan 19, 2023, Isaku Yamahata wrote:
> > On Sat, Jan 14, 2023 at 12:37:59AM +,
> > Sean Christopherson  wrote:
> > 
> > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > This patch series implements KVM guest private memory for confidential
> > > > computing scenarios like Intel TDX[1]. If a TDX host accesses
> > > > TDX-protected guest memory, machine check can happen which can further
> > > > crash the running host system, this is terrible for multi-tenant
> > > > configurations. The host accesses include those from KVM userspace like
> > > > QEMU. This series addresses KVM userspace induced crash by introducing
> > > > new mm and KVM interfaces so KVM userspace can still manage guest memory
> > > > via a fd-based approach, but it can never access the guest memory
> > > > content.
> > > > 
> > > > The patch series touches both core mm and KVM code. I appreciate
> > > > Andrew/Hugh and Paolo/Sean can review and pick these patches. Any other
> > > > reviews are always welcome.
> > > >   - 01: mm change, target for mm tree
> > > >   - 02-09: KVM change, target for KVM tree
> > > 
> > > A version with all of my feedback, plus reworked versions of Vishal's 
> > > selftest,
> > > is available here:
> > > 
> > >   g...@github.com:sean-jc/linux.git x86/upm_base_support
> > > 
> > > It compiles and passes the selftest, but it's otherwise barely tested.  
> > > There are
> > > a few todos (2 I think?) and many of the commits need changelogs, i.e. 
> > > it's still
> > > a WIP.
> > > 
> > > As for next steps, can you (handwaving all of the TDX folks) take a look 
> > > at what
> > > I pushed and see if there's anything horrifically broken, and that it 
> > > still works
> > > for TDX?
> > > 
> > > Fuad (and pKVM folks) same ask for you with respect to pKVM.  Absolutely 
> > > no rush
> > > (and I mean that).
> > > 
> > > On my side, the two things on my mind are (a) tests and (b) downstream 
> > > dependencies
> > > (SEV and TDX).  For tests, I want to build a lists of tests that are 
> > > required for
> > > merging so that the criteria for merging are clear, and so that if the 
> > > list is large
> > > (haven't thought much yet), the work of writing and running tests can be 
> > > distributed.
> > > 
> > > Regarding downstream dependencies, before this lands, I want to pull in 
> > > all the
> > > TDX and SNP series and see how everything fits together.  Specifically, I 
> > > want to
> > > make sure that we don't end up with a uAPI that necessitates ugly code, 
> > > and that we
> > > don't miss an opportunity to make things simpler.  The patches in the SNP 
> > > series to
> > > add "legacy" SEV support for UPM in particular made me slightly rethink 
> > > some minor
> > > details.  Nothing remotely major, but something that needs attention 
> > > since it'll
> > > be uAPI.
> > 
> > Although I'm still debuging with TDX KVM, I needed the following.
> > kvm_faultin_pfn() is called without mmu_lock held.  the race to change
> > private/shared is handled by mmu_seq.  Maybe dedicated function only for
> > kvm_faultin_pfn().
> 
> Gah, you're not on the other thread where this was discussed[*].  Simply 
> deleting
> the lockdep assertion is safe, for guest types that rely on the attributes to
> define shared vs. private, KVM rechecks the attributes under the protection of
> mmu_seq.
> 
> I'll get a fixed version pushed out today.
> 
> [*] https://lore.kernel.org/all/y8gpl+lwsusgb...@google.com

Now I have tdx kvm working. I've uploaded at the followings.
It's rebased to v6.2-rc3.
g...@github.com:yamahata/linux.git tdx/upm
g...@github.com:yamahata/qemu.git tdx/upm

kvm_mmu_do_page_fault() needs the following change.
kvm_mem_is_private() queries mem_attr_array.  kvm_faultin_pfn() also uses
kvm_mem_is_private(). So the shared-private check in kvm_faultin_pfn() doesn't
make sense. This change would belong to TDX KVM patches, though.

diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 72b0da8e27e0..f45ac438bbf4 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -430,7 +430,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu 
*vcpu, gpa_t cr2_or_gpa,
.max_level = vcpu->kvm->arch.tdp_max_page_level,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
-   .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> 
PAGE_SHIFT),
+   .is_private = kvm_is_private_gpa(vcpu->kvm, cr2_or_gpa),
};
int r;


-- 
Isaku Yamahata 



Re: [PATCH v10 0/9] KVM: mm: fd-based approach for supporting KVM

2023-01-19 Thread Isaku Yamahata
On Sat, Jan 14, 2023 at 12:37:59AM +,
Sean Christopherson  wrote:

> On Fri, Dec 02, 2022, Chao Peng wrote:
> > This patch series implements KVM guest private memory for confidential
> > computing scenarios like Intel TDX[1]. If a TDX host accesses
> > TDX-protected guest memory, machine check can happen which can further
> > crash the running host system, this is terrible for multi-tenant
> > configurations. The host accesses include those from KVM userspace like
> > QEMU. This series addresses KVM userspace induced crash by introducing
> > new mm and KVM interfaces so KVM userspace can still manage guest memory
> > via a fd-based approach, but it can never access the guest memory
> > content.
> > 
> > The patch series touches both core mm and KVM code. I appreciate
> > Andrew/Hugh and Paolo/Sean can review and pick these patches. Any other
> > reviews are always welcome.
> >   - 01: mm change, target for mm tree
> >   - 02-09: KVM change, target for KVM tree
> 
> A version with all of my feedback, plus reworked versions of Vishal's 
> selftest,
> is available here:
> 
>   g...@github.com:sean-jc/linux.git x86/upm_base_support
> 
> It compiles and passes the selftest, but it's otherwise barely tested.  There 
> are
> a few todos (2 I think?) and many of the commits need changelogs, i.e. it's 
> still
> a WIP.
> 
> As for next steps, can you (handwaving all of the TDX folks) take a look at 
> what
> I pushed and see if there's anything horrifically broken, and that it still 
> works
> for TDX?
> 
> Fuad (and pKVM folks) same ask for you with respect to pKVM.  Absolutely no 
> rush
> (and I mean that).
> 
> On my side, the two things on my mind are (a) tests and (b) downstream 
> dependencies
> (SEV and TDX).  For tests, I want to build a lists of tests that are required 
> for
> merging so that the criteria for merging are clear, and so that if the list 
> is large
> (haven't thought much yet), the work of writing and running tests can be 
> distributed.
> 
> Regarding downstream dependencies, before this lands, I want to pull in all 
> the
> TDX and SNP series and see how everything fits together.  Specifically, I 
> want to
> make sure that we don't end up with a uAPI that necessitates ugly code, and 
> that we
> don't miss an opportunity to make things simpler.  The patches in the SNP 
> series to
> add "legacy" SEV support for UPM in particular made me slightly rethink some 
> minor
> details.  Nothing remotely major, but something that needs attention since 
> it'll
> be uAPI.

Although I'm still debugging with TDX KVM, I needed the following.
kvm_faultin_pfn() is called without mmu_lock held.  The race to change
private/shared is handled by mmu_seq.  Maybe a dedicated function is needed
only for kvm_faultin_pfn().

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 02be5e1cba1e..38699ca75ab8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2322,7 +2322,7 @@ static inline void kvm_account_pgtable_pages(void *virt, 
int nr)
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t 
gfn)
 {
-   lockdep_assert_held(&kvm->mmu_lock);
+   // lockdep_assert_held(&kvm->mmu_lock);
 
return xa_to_value(xa_load(>mem_attr_array, gfn));
 }


-- 
Isaku Yamahata 



Re: [PATCH v10 1/9] mm: Introduce memfd_restricted system call to create restricted user memory

2023-01-18 Thread Isaku Yamahata
On Wed, Jan 18, 2023 at 04:16:41PM +0800,
Chao Peng  wrote:

> On Tue, Jan 17, 2023 at 04:34:15PM +, Sean Christopherson wrote:
> > On Tue, Jan 17, 2023, Chao Peng wrote:
> > > On Fri, Jan 13, 2023 at 09:54:41PM +, Sean Christopherson wrote:
> > > > > + list_for_each_entry(notifier, &data->notifiers, list) {
> > > > > + notifier->ops->invalidate_start(notifier, start, end);
> > > > 
> > > > Two major design issues that we overlooked long ago:
> > > > 
> > > >   1. Blindly invoking notifiers will not scale.  E.g. if userspace 
> > > > configures a
> > > >  VM with a large number of convertible memslots that are all backed 
> > > > by a
> > > >  single large restrictedmem instance, then converting a single page 
> > > > will
> > > >  result in a linear walk through all memslots.  I don't expect 
> > > > anyone to
> > > >  actually do something silly like that, but I also never expected 
> > > > there to be
> > > >  a legitimate usecase for thousands of memslots.
> > > > 
> > > >   2. This approach fails to provide the ability for KVM to ensure a 
> > > > guest has
> > > >  exclusive access to a page.  As discussed in the past, the kernel 
> > > > can rely
> > > >  on hardware (and maybe ARM's pKVM implementation?) for those 
> > > > guarantees, but
> > > >  only for SNP and TDX VMs.  For VMs where userspace is trusted to 
> > > > some extent,
> > > >  e.g. SEV, there is value in ensuring a 1:1 association.
> > > > 
> > > >  And probably more importantly, relying on hardware for SNP and TDX 
> > > > yields a
> > > >  poor ABI and complicates KVM's internals.  If the kernel doesn't 
> > > > guarantee a
> > > >  page is exclusive to a guest, i.e. if userspace can hand out the 
> > > > same page
> > > >  from a restrictedmem instance to multiple VMs, then failure will 
> > > > occur only
> > > >  when KVM tries to assign the page to the second VM.  That will 
> > > > happen deep
> > > >  in KVM, which means KVM needs to gracefully handle such errors, 
> > > > and it means
> > > >  that KVM's ABI effectively allows plumbing garbage into its 
> > > > memslots.
> > > 
> > > It may not be a valid usage, but in my TDX environment I do meet below
> > > issue.
> > > 
> > > kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x8000 
> > > ua=0x7fe1ebfff000 ret=0
> > > kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc0 
> > > size=0x40 ua=0x7fe271579000 ret=0
> > > kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda 
> > > size=0x2 ua=0x7fe1ec09f000 ret=-22
> > > 
> > > Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
> > > and slot#2 fails due to below exclusive check.
> > > 
> > > Currently I changed QEMU code to mark these alias slots as shared
> > > instead of private but I'm not 100% confident this is correct fix.
> > 
> > That's a QEMU bug of sorts.  SMM is mutually exclusive with TDX, QEMU 
> > shouldn't
> > be configuring SMRAM (or any SMM memslots for that matter) for TDX guests.
> 
> Thanks for the confirmation. As long as we only bind one notifier for
> each address, using xarray does make things simple.

In the past, I had patches for qemu to disable PAM and SMRAM, but they were
dropped for simplicity because SMRAM/PAM are disabled as reset state with unused
memslot registered. TDX guest bios(TDVF or EDK2) doesn't enable them.
Now we can revive them.
-- 
Isaku Yamahata 



Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed

2022-12-06 Thread Isaku Yamahata
On Tue, Dec 06, 2022 at 08:02:24PM +0800,
Chao Peng  wrote:

> On Mon, Dec 05, 2022 at 02:49:59PM -0800, Isaku Yamahata wrote:
> > On Fri, Dec 02, 2022 at 02:13:45PM +0800,
> > Chao Peng  wrote:
> > 
> > > A large page with mixed private/shared subpages can't be mapped as large
> > > page since its sub private/shared pages are from different memory
> > > backends and may also treated by architecture differently. When
> > > private/shared memory are mixed in a large page, the current lpage_info
> > > is not sufficient to decide whether the page can be mapped as large page
> > > or not and additional private/shared mixed information is needed.
> > > 
> > > Tracking this 'mixed' information with the current 'count' like
> > > disallow_lpage is a bit challenge so reserve a bit in 'disallow_lpage'
> > > to indicate a large page has mixed private/share subpages and update
> > > this 'mixed' bit whenever the memory attribute is changed between
> > > private and shared.
> > > 
> > > Signed-off-by: Chao Peng 
> > > ---
> > >  arch/x86/include/asm/kvm_host.h |   8 ++
> > >  arch/x86/kvm/mmu/mmu.c  | 134 +++-
> > >  arch/x86/kvm/x86.c  |   2 +
> > >  include/linux/kvm_host.h|  19 +
> > >  virt/kvm/kvm_main.c |   9 ++-
> > >  5 files changed, 169 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h 
> > > b/arch/x86/include/asm/kvm_host.h
> > > index 283cbb83d6ae..7772ab37ac89 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -38,6 +38,7 @@
> > >  #include 
> > >  
> > >  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
> > > +#define __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> > >  
> > >  #define KVM_MAX_VCPUS 1024
> > >  
> > > @@ -1011,6 +1012,13 @@ struct kvm_vcpu_arch {
> > >  #endif
> > >  };
> > >  
> > > +/*
> > > + * Use a bit in disallow_lpage to indicate private/shared pages mixed at 
> > > the
> > > + * level. The remaining bits are used as a reference count.
> > > + */
> > > +#define KVM_LPAGE_PRIVATE_SHARED_MIXED   (1U << 31)
> > > +#define KVM_LPAGE_COUNT_MAX  ((1U << 31) - 1)
> > > +
> > >  struct kvm_lpage_info {
> > >   int disallow_lpage;
> > >  };
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index e2c70b5afa3e..2190fd8c95c0 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -763,11 +763,16 @@ static void update_gfn_disallow_lpage_count(const 
> > > struct kvm_memory_slot *slot,
> > >  {
> > >   struct kvm_lpage_info *linfo;
> > >   int i;
> > > + int disallow_count;
> > >  
> > >   for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
> > >   linfo = lpage_info_slot(gfn, slot, i);
> > > +
> > > + disallow_count = linfo->disallow_lpage & KVM_LPAGE_COUNT_MAX;
> > > + WARN_ON(disallow_count + count < 0 ||
> > > + disallow_count > KVM_LPAGE_COUNT_MAX - count);
> > > +
> > >   linfo->disallow_lpage += count;
> > > - WARN_ON(linfo->disallow_lpage < 0);
> > >   }
> > >  }
> > >  
> > > @@ -6986,3 +6991,130 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
> > >   if (kvm->arch.nx_huge_page_recovery_thread)
> > >   kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
> > >  }
> > > +
> > > +static bool linfo_is_mixed(struct kvm_lpage_info *linfo)
> > > +{
> > > + return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > +}
> > > +
> > > +static void linfo_set_mixed(gfn_t gfn, struct kvm_memory_slot *slot,
> > > + int level, bool mixed)
> > > +{
> > > + struct kvm_lpage_info *linfo = lpage_info_slot(gfn, slot, level);
> > > +
> > > + if (mixed)
> > > + linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > + else
> > > + linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
> > > +}
> > > +
> > > +static bool is_expected_attr_entry(void *entry, unsigned long 
> > > expected_attrs)
> > > +{
> &

Re: [PATCH v10 5/9] KVM: Use gfn instead of hva for mmu_notifier_retry

2022-12-06 Thread Isaku Yamahata
On Tue, Dec 06, 2022 at 07:56:23PM +0800,
Chao Peng  wrote:

> > > -   if (unlikely(kvm->mmu_invalidate_in_progress) &&
> > > -   hva >= kvm->mmu_invalidate_range_start &&
> > > -   hva < kvm->mmu_invalidate_range_end)
> > > -   return 1;
> > > +   if (unlikely(kvm->mmu_invalidate_in_progress)) {
> > > +   /*
> > > +* Dropping mmu_lock after bumping 
> > > mmu_invalidate_in_progress
> > > +* but before updating the range is a KVM bug.
> > > +*/
> > > +   if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == 
> > > INVALID_GPA ||
> > > +kvm->mmu_invalidate_range_end == 
> > > INVALID_GPA))
> > 
> > INVALID_GPA is an x86-specific define in
> > arch/x86/include/asm/kvm_host.h, so this doesn't build on other
> > architectures. The obvious fix is to move it to
> > include/linux/kvm_host.h.
> 
> Hmm, INVALID_GPA is defined as ZERO for x86, not 100% confident this is
> correct choice for other architectures, but after search it has not been
> used for other architectures, so should be safe to make it common.

INVALID_GPA is defined as all bit 1.  Please notice "~" (tilde).

#define INVALID_GPA (~(gpa_t)0)
-- 
Isaku Yamahata 



Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed

2022-12-05 Thread Isaku Yamahata
G_MAX_ENTRIES  65536
>  
> +#ifdef __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES
> +void kvm_arch_set_memory_attributes(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned long attrs,
> + gfn_t start, gfn_t end);
> +#else
> +static inline void kvm_arch_set_memory_attributes(struct kvm *kvm,
> +   struct kvm_memory_slot *slot,
> +   unsigned long attrs,
> +   gfn_t start, gfn_t end)
> +{
> +}
> +#endif /* __KVM_HAVE_ARCH_SET_MEMORY_ATTRIBUTES */
> +
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 4e1e1e113bf0..e107afea32f0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2354,7 +2354,8 @@ static u64 kvm_supported_mem_attributes(struct kvm *kvm)
>   return 0;
>  }
>  
> -static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end)
> +static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t start, gfn_t end,
> + unsigned long attrs)
>  {
>   struct kvm_gfn_range gfn_range;
>   struct kvm_memory_slot *slot;
> @@ -2378,6 +2379,10 @@ static void kvm_unmap_mem_range(struct kvm *kvm, gfn_t 
> start, gfn_t end)
>   gfn_range.slot = slot;
>  
>   r |= kvm_unmap_gfn_range(kvm, &gfn_range);
> +
> + kvm_arch_set_memory_attributes(kvm, slot, attrs,
> +gfn_range.start,
> +gfn_range.end);
>   }
>   }
>  
> @@ -2427,7 +2432,7 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm 
> *kvm,
>   idx = srcu_read_lock(&kvm->srcu);
>   KVM_MMU_LOCK(kvm);
>   if (i > start)
> - kvm_unmap_mem_range(kvm, start, i);
> + kvm_unmap_mem_range(kvm, start, i, attrs->attributes);
>   kvm_mmu_invalidate_end(kvm);
>   KVM_MMU_UNLOCK(kvm);
>   srcu_read_unlock(&kvm->srcu, idx);
> -- 
> 2.25.1
> 

-- 
Isaku Yamahata 



Re: [PATCH v9 0/8] KVM: mm: fd-based approach for supporting KVM

2022-11-07 Thread Isaku Yamahata
On Thu, Nov 03, 2022 at 05:43:52PM +0530,
Vishal Annapurve  wrote:

> On Tue, Oct 25, 2022 at 8:48 PM Chao Peng  wrote:
> >
> > This patch series implements KVM guest private memory for confidential
> > computing scenarios like Intel TDX[1]. If a TDX host accesses
> > TDX-protected guest memory, machine check can happen which can further
> > crash the running host system, this is terrible for multi-tenant
> > configurations. The host accesses include those from KVM userspace like
> > QEMU. This series addresses KVM userspace induced crash by introducing
> > new mm and KVM interfaces so KVM userspace can still manage guest memory
> > via a fd-based approach, but it can never access the guest memory
> > content.
> >
> > The patch series touches both core mm and KVM code. I appreciate
> > Andrew/Hugh and Paolo/Sean can review and pick these patches. Any other
> > reviews are always welcome.
> >   - 01: mm change, target for mm tree
> >   - 02-08: KVM change, target for KVM tree
> >
> > Given KVM is the only current user for the mm part, I have chatted with
> > Paolo and he is OK to merge the mm change through KVM tree, but
> > reviewed-by/acked-by is still expected from the mm people.
> >
> > The patches have been verified in Intel TDX environment, but Vishal has
> > done an excellent work on the selftests[4] which are dedicated for this
> > series, making it possible to test this series without innovative
> > hardware and fancy steps of building a VM environment. See Test section
> > below for more info.
> >
> >
> > Introduction
> > 
> > KVM userspace being able to crash the host is horrible. Under current
> > KVM architecture, all guest memory is inherently accessible from KVM
> > userspace and is exposed to the mentioned crash issue. The goal of this
> > series is to provide a solution to align mm and KVM, on a userspace
> > inaccessible approach of exposing guest memory.
> >
> > Normally, KVM populates secondary page table (e.g. EPT) by using a host
> > virtual address (hva) from core mm page table (e.g. x86 userspace page
> > table). This requires guest memory being mmaped into KVM userspace, but
> > this is also the source where the mentioned crash issue can happen. In
> > theory, apart from those 'shared' memory for device emulation etc, guest
> > memory doesn't have to be mmaped into KVM userspace.
> >
> > This series introduces fd-based guest memory which will not be mmaped
> > into KVM userspace. KVM populates secondary page table by using a
> 
> With no mappings in place for userspace VMM, IIUC, looks like the host
> kernel will not be able to find the culprit userspace process in case
> of Machine check error on guest private memory. As implemented in
> hwpoison_user_mappings, host kernel tries to look at the processes
> which have mapped the pfns with hardware error.
> 
> Is there a modification needed in mce handling logic of the host
> kernel to immediately send a signal to the vcpu thread accessing
> faulting pfn backing guest private memory?

mce_register_decode_chain() can be used.  MCE physical address(p->mce_addr)
includes host key id in addition to real physical address.  By searching used
hkid by KVM, we can determine if the page is assigned to guest TD or not. If
yes, send SIGBUS.

kvm_machine_check() can be enhanced for KVM specific use.  This is before
memory_failure() is called, though.

any other ideas?
-- 
Isaku Yamahata 



Re: [PATCH v9 7/8] KVM: Handle page fault for private memory

2022-10-31 Thread Isaku Yamahata
On Fri, Oct 28, 2022 at 02:55:45PM +0800,
Chao Peng  wrote:

> On Wed, Oct 26, 2022 at 02:54:25PM -0700, Isaku Yamahata wrote:
> > On Tue, Oct 25, 2022 at 11:13:43PM +0800,
> > Chao Peng  wrote:
> > 
> > > A memslot with KVM_MEM_PRIVATE being set can include both fd-based
> > > private memory and hva-based shared memory. Architecture code (like TDX
> > > code) can tell whether the on-going fault is private or not. This patch
> > > adds a 'is_private' field to kvm_page_fault to indicate this and
> > > architecture code is expected to set it.
> > > 
> > > To handle page fault for such memslot, the handling logic is different
> > > depending on whether the fault is private or shared. KVM checks if
> > > 'is_private' matches the host's view of the page (maintained in
> > > mem_attr_array).
> > >   - For a successful match, private pfn is obtained with
> > > restrictedmem_get_page () from private fd and shared pfn is obtained
> > > with existing get_user_pages().
> > >   - For a failed match, KVM causes a KVM_EXIT_MEMORY_FAULT exit to
> > > userspace. Userspace then can convert memory between private/shared
> > > in host's view and retry the fault.
> > > 
> > > Co-developed-by: Yu Zhang 
> > > Signed-off-by: Yu Zhang 
> > > Signed-off-by: Chao Peng 
> > > ---
> > >  arch/x86/kvm/mmu/mmu.c  | 56 +++--
> > >  arch/x86/kvm/mmu/mmu_internal.h | 14 -
> > >  arch/x86/kvm/mmu/mmutrace.h |  1 +
> > >  arch/x86/kvm/mmu/spte.h |  6 
> > >  arch/x86/kvm/mmu/tdp_mmu.c  |  3 +-
> > >  include/linux/kvm_host.h| 28 +
> > >  6 files changed, 103 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 67a9823a8c35..10017a9f26ee 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -3030,7 +3030,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, 
> > > gfn_t gfn,
> > >  
> > >  int kvm_mmu_max_mapping_level(struct kvm *kvm,
> > > const struct kvm_memory_slot *slot, gfn_t gfn,
> > > -   int max_level)
> > > +   int max_level, bool is_private)
> > >  {
> > >   struct kvm_lpage_info *linfo;
> > >   int host_level;
> > > @@ -3042,6 +3042,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> > >   break;
> > >   }
> > >  
> > > + if (is_private)
> > > + return max_level;
> > 
> > Below PG_LEVEL_NUM is passed by zap_collapsible_spte_range().  It doesn't 
> > make
> > sense.
> > 
> > > +
> > >   if (max_level == PG_LEVEL_4K)
> > >   return PG_LEVEL_4K;
> > >  
> > > @@ -3070,7 +3073,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, 
> > > struct kvm_page_fault *fault
> > >* level, which will be used to do precise, accurate accounting.
> > >*/
> > >   fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> > > -  fault->gfn, 
> > > fault->max_level);
> > > +  fault->gfn, 
> > > fault->max_level,
> > > +  fault->is_private);
> > >   if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
> > >   return;
> > >  
> > > @@ -4141,6 +4145,32 @@ void kvm_arch_async_page_ready(struct kvm_vcpu 
> > > *vcpu, struct kvm_async_pf *work)
> > >   kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
> > >  }
> > >  
> > > +static inline u8 order_to_level(int order)
> > > +{
> > > + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
> > > +
> > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
> > > + return PG_LEVEL_1G;
> > > +
> > > + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
> > > + return PG_LEVEL_2M;
> > > +
> > > + return PG_LEVEL_4K;
> > > +}
> > > +
> > > +static int kvm_faultin_pfn_private(struct kvm_page_fault *fault)
> > > +{
> > > + int order;
> > > + struct kvm_memory_slot *slot = fault->slot;
> > > +
> > > + if (kvm_restricted_mem_get_pfn(slot, fault->gf

Re: [PATCH v9 7/8] KVM: Handle page fault for private memory

2022-10-26 Thread Isaku Yamahata
   return false;
> +}
> +

PFN encoded in spte doesn't make sense.  In VMM for TDX, private-vs-shared is
determined by S-bit of GFN.


>  static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 
> pte,
>   int level)
>  {
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 672f0432d777..9f97aac90606 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1768,7 +1768,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>   continue;
>  
>   max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> -   iter.gfn, 
> PG_LEVEL_NUM);
> + iter.gfn, PG_LEVEL_NUM,
> + is_private_spte(iter.old_spte));
>   if (max_mapping_level < iter.level)
>   continue;

This is to merge pages into a large page on the next kvm page fault.  large page
support is not yet supported.  Let's skip the private slot until large page
support is done.
-- 
Isaku Yamahata 



Re: [PATCH v9 6/8] KVM: Update lpage info when private/shared memory are mixed

2022-10-26 Thread Isaku Yamahata
xed(lpage_info_slot(lpage_start, slot, level),
> +mem_attr_is_mixed(kvm, slot, level, attr,
> +  lpage_start, start));
> +
> + if (lpage_start == lpage_end)
> + return;
> +
> +     for (gfn = lpage_start + pages; gfn < lpage_end; gfn += pages)
> + linfo_update_mixed(lpage_info_slot(gfn, slot, level),
> +false);
> +
> + linfo_update_mixed(lpage_info_slot(lpage_end, slot, level),
> +mem_attr_is_mixed(kvm, slot, level, attr,
> +  end, lpage_end + pages));
> + }
> +}

-- 
Isaku Yamahata 



Re: [PATCH v9 1/8] mm: Introduce memfd_restricted system call to create restricted user memory

2022-10-26 Thread Isaku Yamahata
On Tue, Oct 25, 2022 at 11:13:37PM +0800,
Chao Peng  wrote:

> +int restrictedmem_get_page(struct file *file, pgoff_t offset,
> +struct page **pagep, int *order)
> +{
> + struct restrictedmem_data *data = file->f_mapping->private_data;
> + struct file *memfd = data->memfd;
> + struct page *page;
> + int ret;
> +
> + ret = shmem_getpage(file_inode(memfd), offset, &page, SGP_WRITE);

shmem_getpage() was removed.
https://lkml.kernel.org/r/20220902194653.1739778-34-wi...@infradead.org

I needed the following fix to compile.

thanks,

diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index e5bf8907e0f8..4694dd5609d6 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -231,13 +231,15 @@ int restrictedmem_get_page(struct file *file, pgoff_t 
offset,
 {
struct restrictedmem_data *data = file->f_mapping->private_data;
struct file *memfd = data->memfd;
+   struct folio *folio = NULL;
struct page *page;
int ret;
 
-   ret = shmem_getpage(file_inode(memfd), offset, &page, SGP_WRITE);
+   ret = shmem_get_folio(file_inode(memfd), offset, &folio, SGP_WRITE);
if (ret)
return ret;
 
+   page = folio_file_page(folio, offset);
*pagep = page;
if (order)
*order = thp_order(compound_head(page));
-- 
Isaku Yamahata 



Re: [PATCH v8 2/8] KVM: Extend the memslot to support fd-based private memory

2022-09-29 Thread Isaku Yamahata
On Thu, Sep 15, 2022 at 10:29:07PM +0800,
Chao Peng  wrote:
...
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 584a5bab3af3..12dc0dc57b06 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
...
> @@ -4622,6 +4622,33 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
>   return fd;
>  }
>  
> +#define SANITY_CHECK_MEM_REGION_FIELD(field) 
> \
> +do { 
> \
> + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != 
> \
> +  offsetof(struct kvm_userspace_memory_region, field));  
> \
> + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != 
> \
> +  sizeof_field(struct kvm_userspace_memory_region, field));  
> \
> +} while (0)
> +
> +#define SANITY_CHECK_MEM_REGION_EXT_FIELD(field) 
> \
> +do { 
> \
> + BUILD_BUG_ON(offsetof(struct kvm_user_mem_region, field) != 
> \
> +  offsetof(struct kvm_userspace_memory_region_ext, field));  
> \
> + BUILD_BUG_ON(sizeof_field(struct kvm_user_mem_region, field) != 
> \
> +  sizeof_field(struct kvm_userspace_memory_region_ext, 
> field));  \
> +} while (0)
> +
> +static void kvm_sanity_check_user_mem_region_alias(void)
> +{
> + SANITY_CHECK_MEM_REGION_FIELD(slot);
> + SANITY_CHECK_MEM_REGION_FIELD(flags);
> + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
> + SANITY_CHECK_MEM_REGION_FIELD(memory_size);
> + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
> + SANITY_CHECK_MEM_REGION_EXT_FIELD(private_offset);
> + SANITY_CHECK_MEM_REGION_EXT_FIELD(private_fd);
> +}
> +
>  static long kvm_vm_ioctl(struct file *filp,
>  unsigned int ioctl, unsigned long arg)
>  {
> @@ -4645,14 +4672,20 @@ static long kvm_vm_ioctl(struct file *filp,
>   break;
>   }
>   case KVM_SET_USER_MEMORY_REGION: {
> - struct kvm_userspace_memory_region kvm_userspace_mem;
> + struct kvm_user_mem_region mem;
> + unsigned long size = sizeof(struct kvm_userspace_memory_region);
> +
> + kvm_sanity_check_user_mem_region_alias();
>  
>   r = -EFAULT;
> - if (copy_from_user(&kvm_userspace_mem, argp,
> - sizeof(kvm_userspace_mem)))
> + if (copy_from_user(&mem, argp, size))
> + goto out;
> +
> + r = -EINVAL;
> + if (mem.flags & KVM_MEM_PRIVATE)
>       goto out;

Nit:  It's better to check if padding is zero.  Maybe rename it to reserved.

+   if (mem.pad1 || memchr_inv(mem.pad2, 0, sizeof(mem.pad2)))
+   goto out;
-- 
Isaku Yamahata 



Re: [PATCH v8 6/8] KVM: Update lpage info when private/shared memory are mixed

2022-09-29 Thread Isaku Yamahata
pages = KVM_PAGES_PER_HPAGE(level);
gfn_t mask = ~(pages - 1);
struct kvm_lpage_info *linfo = lpage_info_slot(gfn & mask, slot, level);

WARN_ON_ONCE(level == PG_LEVEL_4K);
return linfo->disallow_lpage & KVM_LPAGE_PRIVATE_SHARED_MIXED;
}

#ifdef CONFIG_HAVE_KVM_PRIVATE_MEM_ATTR
static void update_mixed(struct kvm_lpage_info *linfo, bool mixed)
{
if (mixed)
linfo->disallow_lpage |= KVM_LPAGE_PRIVATE_SHARED_MIXED;
else
linfo->disallow_lpage &= ~KVM_LPAGE_PRIVATE_SHARED_MIXED;
}

static bool __mem_attr_is_mixed(struct kvm *kvm, gfn_t start, gfn_t end)
{
XA_STATE(xas, &kvm->mem_attr_array, start);
bool mixed = false;
gfn_t gfn = start;
void *s_entry;
void *entry;

rcu_read_lock();
s_entry = xas_load(&xas);
entry = s_entry;
while (gfn < end) {
if (xas_retry(&xas, entry))
continue;

KVM_BUG_ON(gfn != xas.xa_index, kvm);

entry = xas_next(&xas);
if (entry != s_entry) {
mixed = true;
break;
}
gfn++;
}
rcu_read_unlock();
return mixed;
}

static bool mem_attr_is_mixed(struct kvm *kvm,
  struct kvm_memory_slot *slot, int level,
  gfn_t start, gfn_t end)
{
struct kvm_lpage_info *child_linfo;
unsigned long child_pages;
bool mixed = false;
unsigned long gfn;
void *entry;

if (WARN_ON_ONCE(level == PG_LEVEL_4K))
return false;

if (level == PG_LEVEL_2M)
return __mem_attr_is_mixed(kvm, start, end);

/* This assumes that level - 1 is already updated. */
rcu_read_lock();
child_pages = KVM_PAGES_PER_HPAGE(level - 1);
entry = xa_load(&kvm->mem_attr_array, start);
for (gfn = start; gfn < end; gfn += child_pages) {
child_linfo = lpage_info_slot(gfn, slot, level - 1);
if (child_linfo->disallow_lpage & 
KVM_LPAGE_PRIVATE_SHARED_MIXED) {
mixed = true;
break;
}
if (xa_load(&kvm->mem_attr_array, gfn) != entry) {
mixed = true;
break;
}
}
rcu_read_unlock();
return mixed;
}

static void update_mem_lpage_info(struct kvm *kvm,
  struct kvm_memory_slot *slot,
  unsigned int attr,
  gfn_t start, gfn_t end)
{
unsigned long lpage_start, lpage_end;
unsigned long gfn, pages, mask;
int level;

for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
pages = KVM_PAGES_PER_HPAGE(level);
mask = ~(pages - 1);
lpage_start = start & mask;
lpage_end = (end - 1) & mask;

/*
 * We only need to scan the head and tail page, for middle pages
 * we know they are not mixed.
 */
update_mixed(lpage_info_slot(lpage_start, slot, level),
 mem_attr_is_mixed(kvm, slot, level,
   lpage_start, lpage_start + 
pages));

if (lpage_start == lpage_end)
return;

for (gfn = lpage_start + pages; gfn < lpage_end; gfn += pages)
update_mixed(lpage_info_slot(gfn, slot, level), false);

update_mixed(lpage_info_slot(lpage_end, slot, level),
 mem_attr_is_mixed(kvm, slot, level,
   lpage_end, lpage_end + pages));
}
}

void kvm_arch_update_mem_attr(struct kvm *kvm, unsigned int attr,
  gfn_t start, gfn_t end)
{
struct kvm_memory_slot *slot;
struct kvm_memslots *slots;
struct kvm_memslot_iter iter;
int idx;
int i;

WARN_ONCE(!(attr & (KVM_MEM_ATTR_PRIVATE | KVM_MEM_ATTR_SHARED)),
  "Unsupported mem attribute.\n");

idx = srcu_read_lock(&kvm->srcu);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);

    kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
slot = iter.slot;
start = max(start, slot->base_gfn);
end = min(end, slot->base_gfn + slot->npages);
if (WARN_ON_ONCE(start >= end))
continue;

update_mem_lpage_info(kvm, slot, attr, start, end);
}
}
srcu_read_unlock(&kvm->srcu, idx);
}
#endif


-- 
Isaku Yamahata 



Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-08-22 Thread Isaku Yamahata
On Wed, Aug 17, 2022 at 10:27:19AM -0500,
Michael Roth  wrote:

> > I think the best approach is to turn KVM_TDX_INIT_MEM_REGION into a generic
> > vCPU-scoped ioctl() that allows userspace to pre-map guest memory.  
> > Supporting
> > initializing guest private memory with a source page can be implemented via 
> > a
> > flag.  That also gives KVM line of sight to in-place "conversion", e.g. 
> > another
> > flag could be added to say that the dest is also the source.
> 
> So is this proposed ioctl only intended to handle the initial encrypted
> payload, and the KVM_MEMORY_ENCRYPT_{REG,UNREG}_REGION ioctls would
> still be used for conversions post-boot?

Yes.  It is called before running any vcpu.  At run time (after running vcpus),
KVM_MEMORY_ENCRYPT_{REG,UNREG}_REGION is used.
-- 
Isaku Yamahata 



Re: [PATCH v7 07/14] KVM: Use gfn instead of hva for mmu_notifier_retry

2022-08-04 Thread Isaku Yamahata
On Wed, Jul 06, 2022 at 04:20:09PM +0800,
Chao Peng  wrote:

> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 0bdb6044e316..e9153b54e2a4 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1362,10 +1362,8 @@ void kvm_mmu_free_memory_cache(struct 
> kvm_mmu_memory_cache *mc);
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
>  #endif
>  
> -void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
> -unsigned long end);
> -void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
> -unsigned long end);
> +void kvm_inc_notifier_count(struct kvm *kvm, gfn_t start, gfn_t end);
> +void kvm_dec_notifier_count(struct kvm *kvm, gfn_t start, gfn_t end);
>  
>  long kvm_arch_dev_ioctl(struct file *filp,
>   unsigned int ioctl, unsigned long arg);

The corresponding changes in kvm_main.c are missing.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b2c79bef61bd..0184e327f6f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -711,8 +711,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier 
*mn,
kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
 }
 
-void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
-  unsigned long end)
+void kvm_inc_notifier_count(struct kvm *kvm, gfn_t start, gfn_t end)
 {
/*
 * The count increase must become visible at unlock time as no
@@ -786,8 +785,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
return 0;
 }
 
-void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
-  unsigned long end)
+void kvm_dec_notifier_count(struct kvm *kvm, gfn_t start, gfn_t end)
 {
/*
 * This sequence increase will notify the kvm page fault that


-- 
Isaku Yamahata 



Re: [RFC PATCH v4 22/36] i386/tdx: Track RAM entries for TDX VM

2022-05-26 Thread Isaku Yamahata
On Thu, May 26, 2022 at 03:33:10PM +0800,
Xiaoyao Li  wrote:

> On 5/24/2022 3:37 PM, Gerd Hoffmann wrote:
> > I think all this can be simplified, by
> >(1) Change the existing entry to cover the accepted ram range.
> >(2) If there is room before the accepted ram range add a
> >TDX_RAM_UNACCEPTED entry for that.
> >(3) If there is room after the accepted ram range add a
> >TDX_RAM_UNACCEPTED entry for that.
> 
> I implement as below. Please help review.
> 
> +static int tdx_accept_ram_range(uint64_t address, uint64_t length)
> +{
> +uint64_t head_start, tail_start, head_length, tail_length;
> +uint64_t tmp_address, tmp_length;
> +TdxRamEntry *e;
> +int i;
> +
> +for (i = 0; i < tdx_guest->nr_ram_entries; i++) {
> +e = _guest->ram_entries[i];
> +
> +if (address + length < e->address ||
> +e->address + e->length < address) {
> +continue;
> +}
> +
> +/*
> + * The to-be-accepted ram range must be fully contained by one
> + * RAM entries
> + */
> +if (e->address > address ||
> +e->address + e->length < address + length) {
> +return -EINVAL;
> +}
> +
> +if (e->type == TDX_RAM_ADDED) {
> +return -EINVAL;
> +}
> +
> +tmp_address = e->address;
> +tmp_length = e->length;
> +
> +e->address = address;
> +e->length = length;
> +e->type = TDX_RAM_ADDED;
> +
> +head_length = address - tmp_address;
> +if (head_length > 0) {
> +head_start = e->address;
> +    tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED);

tdx_add_ram_entry() increments tdx_guest->nr_ram_entries.  I think it's worth
adding a comment explaining why this is safe with regard to this for-loop.
-- 
Isaku Yamahata 



Re: [RFC PATCH v4 07/36] i386/tdx: Introduce is_tdx_vm() helper and cache tdx_guest object

2022-05-23 Thread Isaku Yamahata
On Mon, May 23, 2022 at 10:48:17AM +0200,
Gerd Hoffmann  wrote:

> > diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h
> > index c8a23d95258d..4036ca2f3f99 100644
> > --- a/target/i386/kvm/tdx.h
> > +++ b/target/i386/kvm/tdx.h
> > @@ -1,6 +1,10 @@
> >  #ifndef QEMU_I386_TDX_H
> >  #define QEMU_I386_TDX_H
> >  
> > +#ifndef CONFIG_USER_ONLY
> > +#include CONFIG_DEVICES /* CONFIG_TDX */
> > +#endif
> > +
> >  #include "exec/confidential-guest-support.h"
> >  
> >  #define TYPE_TDX_GUEST "tdx-guest"
> > @@ -16,6 +20,12 @@ typedef struct TdxGuest {
> >  uint64_t attributes;/* TD attributes */
> >  } TdxGuest;
> >  
> > +#ifdef CONFIG_TDX
> > +bool is_tdx_vm(void);
> > +#else
> > +#define is_tdx_vm() 0
> 
> Just add that to the tdx-stubs.c file you already created in one of the
> previous patches and drop this #ifdef mess ;)

This is for consistency with SEV.  Anyway, either way is okay.

>From target/i386/sev.h
  ...
  #ifdef CONFIG_SEV
  bool sev_enabled(void);
  bool sev_es_enabled(void);
  #else
  #define sev_enabled() 0
  #define sev_es_enabled() 0
  #endif

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 03/36] target/i386: Implement mc->kvm_type() to get VM type

2022-05-23 Thread Isaku Yamahata
On Mon, May 23, 2022 at 10:36:16AM +0200,
Gerd Hoffmann  wrote:

>   Hi,
> 
> > +if (!(kvm_check_extension(KVM_STATE(ms->accelerator), 
> > KVM_CAP_VM_TYPES) & BIT(kvm_type))) {
> > +error_report("vm-type %s not supported by KVM", 
> > vm_type_name[kvm_type]);
> > +exit(1);
> > +}
> 
> Not sure why TDX needs a new vm type whereas sev doesn't.  But that's up
> for debate in the kernel tdx patches, not here.  Assuming the kernel
> interface actually merged will look like this the patch makes sense.

Because VM operations, e.g. KVM_CREATE_VCPU, require a TDX-specific one on the
KVM side, we need to tell KVM that this VM is a TD.
Also it's for consistency.  It's a common pattern among other archs to specify
the VM type with KVM_CREATE_VM: S390, PPC, MIPS, and ARM64.  Only SEV is
an exception.  It turns a default VM into a confidential VM after KVM_CREATE_VM.

Thanks,

> 
> Acked-by: Gerd Hoffmann 
> 
> take care,
>   Gerd
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 14/36] i386/tdx: Implement user specified tsc frequency

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:41AM +0800,
Xiaoyao Li  wrote:

> Reuse "-cpu,tsc-frequency=" to get user wanted tsc frequency and pass it
> to KVM_TDX_INIT_VM.
> 
> Besides, sanity check the tsc frequency to be in the legal range and
> legal granularity (required by TDX module).

Just to make sure:
you didn't use the VM-scoped KVM_SET_TSC_KHZ because the KVM-side patch is still
in kvm/queue?  Once the patch lands, we should use it.

Thanks,

> 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/kvm.c |  8 
>  target/i386/kvm/tdx.c | 18 ++
>  2 files changed, 26 insertions(+)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index f2d7c3cf59ac..c51125ab200f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -818,6 +818,14 @@ static int kvm_arch_set_tsc_khz(CPUState *cs)
>  int r, cur_freq;
>  bool set_ioctl = false;
>  
> +/*
> + * TD guest's TSC is immutable, it cannot be set/changed via
> + * KVM_SET_TSC_KHZ, but only be initialized via KVM_TDX_INIT_VM
> + */
> +if (is_tdx_vm()) {
> +return 0;
> +}
> +
>  if (!env->tsc_khz) {
>  return 0;
>  }
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 9f2cdf640b5c..622efc409438 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -35,6 +35,9 @@
>  #define TDX_TD_ATTRIBUTES_PKS   BIT_ULL(30)
>  #define TDX_TD_ATTRIBUTES_PERFMON   BIT_ULL(63)
>  
> +#define TDX_MIN_TSC_FREQUENCY_KHZ   (100 * 1000)
> +#define TDX_MAX_TSC_FREQUENCY_KHZ   (10 * 1000 * 1000)
> +
>  static TdxGuest *tdx_guest;
>  
>  /* It's valid after kvm_confidential_guest_init()->kvm_tdx_init() */
> @@ -211,6 +214,20 @@ int tdx_pre_create_vcpu(CPUState *cpu)
>  goto out;
>  }
>  
> +r = -EINVAL;
> +if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ ||
> + env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) {
> +error_report("Invalid TSC %ld KHz, must specify cpu_frequency 
> between [%d, %d] kHz",
> +  env->tsc_khz, TDX_MIN_TSC_FREQUENCY_KHZ,
> +  TDX_MAX_TSC_FREQUENCY_KHZ);
> +goto out;
> +}
> +
> +if (env->tsc_khz % (25 * 1000)) {
> +error_report("Invalid TSC %ld KHz, it must be multiple of 25MHz", 
> env->tsc_khz);
> +goto out;
> +}
> +
>  r = setup_td_guest_attributes(x86cpu);
>  if (r) {
>  goto out;
> @@ -221,6 +238,7 @@ int tdx_pre_create_vcpu(CPUState *cpu)
>  
>  init_vm.attributes = tdx_guest->attributes;
>  init_vm.max_vcpus = ms->smp.cpus;
> +init_vm.tsc_khz = env->tsc_khz;
>  
>  r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, _vm);
>  if (r < 0) {
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 23/36] i386/tdx: Setup the TD HOB list

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:50AM +0800,
Xiaoyao Li  wrote:

> The TD HOB list is used to pass the information from VMM to TDVF. The TD
> HOB must include PHIT HOB and Resource Descriptor HOB. More details can
> be found in TDVF specification and PI specification.
> 
> Build the TD HOB in TDX's machine_init_done callback.

Because HOB is introduced here for the first time, please expand the HOB acronym.


> Co-developed-by: Isaku Yamahata 
> Signed-off-by: Isaku Yamahata 
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Xiaoyao Li 
> ---
>  hw/i386/meson.build   |   2 +-
>  hw/i386/tdvf-hob.c| 212 ++
>  hw/i386/tdvf-hob.h|  25 +
>  hw/i386/uefi.h| 198 +++
>  target/i386/kvm/tdx.c |  16 
>  5 files changed, 452 insertions(+), 1 deletion(-)
>  create mode 100644 hw/i386/tdvf-hob.c
>  create mode 100644 hw/i386/tdvf-hob.h
>  create mode 100644 hw/i386/uefi.h
> 
> diff --git a/hw/i386/meson.build b/hw/i386/meson.build
> index 97f3b50503b0..b59e0d35bba3 100644
> --- a/hw/i386/meson.build
> +++ b/hw/i386/meson.build
> @@ -28,7 +28,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files(
>'port92.c'))
>  i386_ss.add(when: 'CONFIG_X86_FW_OVMF', if_true: files('pc_sysfw_ovmf.c'),
>  if_false: 
> files('pc_sysfw_ovmf-stubs.c'))
> -i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c'))
> +i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c'))
>  
>  subdir('kvm')
>  subdir('xen')
> diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c
> new file mode 100644
> index ..31160e9f95c5
> --- /dev/null
> +++ b/hw/i386/tdvf-hob.c
> @@ -0,0 +1,212 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> +
> + * Copyright (c) 2020 Intel Corporation
> + * Author: Isaku Yamahata 
> + *
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "e820_memory_layout.h"
> +#include "hw/i386/pc.h"
> +#include "hw/i386/x86.h"
> +#include "hw/pci/pcie_host.h"
> +#include "sysemu/kvm.h"
> +#include "tdvf-hob.h"
> +#include "uefi.h"
> +
> +typedef struct TdvfHob {
> +hwaddr hob_addr;
> +void *ptr;
> +int size;
> +
> +/* working area */
> +void *current;
> +void *end;
> +} TdvfHob;
> +
> +static uint64_t tdvf_current_guest_addr(const TdvfHob *hob)
> +{
> +return hob->hob_addr + (hob->current - hob->ptr);
> +}
> +
> +static void tdvf_align(TdvfHob *hob, size_t align)
> +{
> +hob->current = QEMU_ALIGN_PTR_UP(hob->current, align);
> +}
> +
> +static void *tdvf_get_area(TdvfHob *hob, uint64_t size)
> +{
> +void *ret;
> +
> +if (hob->current + size > hob->end) {
> +error_report("TD_HOB overrun, size = 0x%" PRIx64, size);
> +exit(1);
> +}
> +
> +ret = hob->current;
> +hob->current += size;
> +tdvf_align(hob, 8);
> +return ret;
> +}
> +
> +static void tdvf_hob_add_mmio_resource(TdvfHob *hob, uint64_t start,
> +   uint64_t end)
> +{
> +EFI_HOB_RESOURCE_DESCRIPTOR *region;
> +
> +if (!start) {
> +return;
> +}
> +
> +region = tdvf_get_area(hob, sizeof(*region));
> +*region = (EFI_HOB_RESOURCE_DESCRIPTOR) {
> +.Header = {
> +.HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR,
> +.HobLength = cpu_to_le16(sizeof(*region)),
> +.Reserved = cpu_to_le32(0),
> +},
> +.Owner = EFI_HOB_OWNER_ZERO,
> +.ResourceType = cpu_to_le32(EFI_RESOURCE_MEMORY_MAPPED_IO),
> +.ResourceAttribute = cpu_to_le32(EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO),
> +.PhysicalStart = cpu_to_le64(start),
> +.ResourceLength = cpu_to_le64(end - start),
> +};

Re: [RFC PATCH v4 36/36] docs: Add TDX documentation

2022-05-12 Thread Isaku Yamahata
 is DEBUG bit, which decides if the TD runs in off-TD
> +debug mode. When in off-TD debug mode, TD's VCPU state and private memory are
> +accessible via given SEAMCALLs. This requires KVM to expose APIs to invoke 
> those
> +SEAMCALLs and resonponding QEMU change.
> +
> +It's targeted as future work.
> +
> +restrictions
> +
> +
> + - No readonly support for private memory;
> +
> + - No SMM support: SMM support requires manipulating the guset register 
> states
> +   which is not allowed;
> +
> +Live Migration
> +--
> +
> +TODO
> +
> +References
> +--
> +
> +- `TDX Homepage 
> <https://www.intel.com/content/www/us/en/developer/articles/technical/intel-trust-domain-extensions.html>`__
> diff --git a/docs/system/target-i386.rst b/docs/system/target-i386.rst
> index 96bf54889a82..16dd4f1a8c80 100644
> --- a/docs/system/target-i386.rst
> +++ b/docs/system/target-i386.rst
> @@ -29,6 +29,7 @@ Architectural features
> i386/kvm-pv
> i386/sgx
> i386/amd-memory-encryption
> +   i386/tdx
>  
>  .. _pcsys_005freq:
>  
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 10/36] i386/kvm: Move architectural CPUID leaf generation to separate helper

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:37AM +0800,
Xiaoyao Li  wrote:

> diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
> index b434feaa6b1d..5c7972f617e8 100644
> --- a/target/i386/kvm/kvm_i386.h
> +++ b/target/i386/kvm/kvm_i386.h
> @@ -24,6 +24,10 @@
>  #define kvm_ioapic_in_kernel() \
>  (kvm_irqchip_in_kernel() && !kvm_irqchip_is_split())
>  
> +#define KVM_MAX_CPUID_ENTRIES  100

On the Linux side, the value was bumped to 256.  Opportunistically, let's make it
the same.

3f4e3eb417b1 KVM: x86: bump KVM_MAX_CPUID_ENTRIES

> +uint32_t kvm_x86_arch_cpuid(CPUX86State *env, struct kvm_cpuid_entry2 
> *entries,
> +uint32_t cpuid_i);
> +
>  #else
>  
>  #define kvm_pit_in_kernel()  0
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 24/36] i386/tdx: Add TDVF memory via KVM_TDX_INIT_MEM_REGION

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:51AM +0800,
Xiaoyao Li  wrote:

> From: Isaku Yamahata 
> 
> TDVF firmware (CODE and VARS) needs to be added/copied to TD's private
> memory via KVM_TDX_INIT_MEM_REGION, as well as TD HOB and TEMP memory.
> 
> Signed-off-by: Isaku Yamahata 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/tdx.c | 24 
>  1 file changed, 24 insertions(+)
> 
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 3e18ace90bf7..567ee12e88f0 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -240,6 +240,7 @@ static void tdx_finalize_vm(Notifier *notifier, void 
> *unused)
>  {
>  TdxFirmware *tdvf = _guest->tdvf;
>  TdxFirmwareEntry *entry;
> +int r;
>  
>  tdx_init_ram_entries();
>  
> @@ -265,6 +266,29 @@ static void tdx_finalize_vm(Notifier *notifier, void 
> *unused)
>sizeof(TdxRamEntry), _ram_entry_compare);
>  
>  tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));
> +
> +for_each_tdx_fw_entry(tdvf, entry) {
> +struct kvm_tdx_init_mem_region mem_region = {
> +.source_addr = (__u64)entry->mem_ptr,
> +.gpa = entry->address,
> +.nr_pages = entry->size / 4096,
> +};
> +
> +__u32 metadata = entry->attributes & 
> TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
> + KVM_TDX_MEASURE_MEMORY_REGION : 0;

Please use flags instead of metadata.


> +r = tdx_vm_ioctl(KVM_TDX_INIT_MEM_REGION, metadata, _region);
> +if (r < 0) {
> + error_report("KVM_TDX_INIT_MEM_REGION failed %s", strerror(-r));
> + exit(1);
> +}
> +
> +if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
> +entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
> +qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
> +entry->mem_ptr = NULL;
> +}
> +}
>  }
>  
>  static Notifier tdx_machine_done_notify = {
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 06/36] i386/tdx: Get tdx_capabilities via KVM_TDX_CAPABILITIES

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:33AM +0800,
Xiaoyao Li  wrote:

> KVM provides TDX capabilities via sub command KVM_TDX_CAPABILITIES of
> IOCTL(KVM_MEMORY_ENCRYPT_OP). Get the capabilities when initializing
> TDX context. It will be used to validate user's setting later.
> 
> Besides, introduce the interfaces to invoke TDX "ioctls" at different
> scope (KVM, VM and VCPU) in preparation.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/tdx.c | 85 +++
>  1 file changed, 85 insertions(+)
> 
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 77e33ae01147..68bedbad0ebe 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -14,12 +14,97 @@
>  #include "qemu/osdep.h"
>  #include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
>  #include "tdx.h"
>  
> +enum tdx_ioctl_level{
> +TDX_PLATFORM_IOCTL,
> +TDX_VM_IOCTL,
> +TDX_VCPU_IOCTL,
> +};
> +
> +static int __tdx_ioctl(void *state, enum tdx_ioctl_level level, int cmd_id,
> +__u32 flags, void *data)
> +{
> +struct kvm_tdx_cmd tdx_cmd;
> +int r;
> +
> +memset(_cmd, 0x0, sizeof(tdx_cmd));
> +
> +tdx_cmd.id = cmd_id;
> +tdx_cmd.flags = flags;
> +tdx_cmd.data = (__u64)(unsigned long)data;
> +
> +switch (level) {
> +case TDX_PLATFORM_IOCTL:
> +r = kvm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VM_IOCTL:
> +r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VCPU_IOCTL:
> +r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +default:
> +error_report("Invalid tdx_ioctl_level %d", level);
> +exit(1);
> +}
> +
> +return r;
> +}
> +
> +static inline int tdx_platform_ioctl(int cmd_id, __u32 metadata, void *data)

nitpick:  Because metadata was renamed to flags for clarity, please update
those.

> +{
> +return __tdx_ioctl(NULL, TDX_PLATFORM_IOCTL, cmd_id, metadata, data);
> +}
> +
> +static inline int tdx_vm_ioctl(int cmd_id, __u32 metadata, void *data)
> +{
> +return __tdx_ioctl(NULL, TDX_VM_IOCTL, cmd_id, metadata, data);
> +}
> +
> +static inline int tdx_vcpu_ioctl(void *vcpu_fd, int cmd_id, __u32 metadata,
> + void *data)
> +{
> +return  __tdx_ioctl(vcpu_fd, TDX_VCPU_IOCTL, cmd_id, metadata, data);
> +}
> +
> +static struct kvm_tdx_capabilities *tdx_caps;
> +
> +static void get_tdx_capabilities(void)
> +{
> +struct kvm_tdx_capabilities *caps;
> +int max_ent = 1;

Because we know the number of entries for TDX 1.0, we can start with a better
value, with a comment explaining it.


> +int r, size;
> +
> +do {
> +size = sizeof(struct kvm_tdx_capabilities) +
> +   max_ent * sizeof(struct kvm_tdx_cpuid_config);
> +caps = g_malloc0(size);
> +caps->nr_cpuid_configs = max_ent;
> +
> +r = tdx_platform_ioctl(KVM_TDX_CAPABILITIES, 0, caps);
> +if (r == -E2BIG) {
> +g_free(caps);
> +max_ent *= 2;
> +} else if (r < 0) {
> +    error_report("KVM_TDX_CAPABILITIES failed: %s\n", strerror(-r));
> +exit(1);
> +}
> +}
> +while (r == -E2BIG);
> +
> +tdx_caps = caps;
> +}
> +
>  int tdx_kvm_init(MachineState *ms, Error **errp)
>  {
> +if (!tdx_caps) {
> +get_tdx_capabilities();
> +}
> +
>  return 0;
>  }
>  
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v4 09/36] KVM: Introduce kvm_arch_pre_create_vcpu()

2022-05-12 Thread Isaku Yamahata
On Thu, May 12, 2022 at 11:17:36AM +0800,
Xiaoyao Li  wrote:

> Introduce kvm_arch_pre_create_vcpu(), to perform arch-dependent
> work prior to create any vcpu. This is for i386 TDX because it needs
> call TDX_INIT_VM before creating any vcpu.

Because "11/36 i386/tdx: Initialize TDX before creating TD vcpus" uses
kvm_arch_pre_create_vcpu() (and 10/36 doesn't use it), please move this patch
to right before 11/36 (i.e. swap 09/36 and 10/36).

Thanks,

> Signed-off-by: Xiaoyao Li 
> ---
>  accel/kvm/kvm-all.c  | 12 
>  include/sysemu/kvm.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 32e177bd26b4..e6fa9d23207a 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -457,6 +457,11 @@ static int kvm_get_vcpu(KVMState *s, unsigned long 
> vcpu_id)
>  return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
>  }
>  
> +int __attribute__ ((weak)) kvm_arch_pre_create_vcpu(CPUState *cpu)
> +{
> +return 0;
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  {
>  KVMState *s = kvm_state;
> @@ -465,6 +470,13 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  
>  trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
>  
> +ret = kvm_arch_pre_create_vcpu(cpu);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret,
> + "kvm_init_vcpu: kvm_arch_pre_create_vcpu() failed");
> +goto err;
> +}
> +
>  ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>  if (ret < 0) {
>  error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed 
> (%lu)",
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index a783c7886811..0e94031ab7c7 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -373,6 +373,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level);
>  
>  int kvm_arch_init(MachineState *ms, KVMState *s);
>  
> +int kvm_arch_pre_create_vcpu(CPUState *cpu);
>  int kvm_arch_init_vcpu(CPUState *cpu);
>  int kvm_arch_destroy_vcpu(CPUState *cpu);
>  
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v3 12/36] i386/tdx: Add property sept-ve-disable for tdx-guest object

2022-03-24 Thread Isaku Yamahata
On Thu, Mar 24, 2022 at 10:37:25AM +0100,
Gerd Hoffmann  wrote:

> > #VE can be triggered in various situations. e.g., CPUID on some leaves, and
> > RD/WRMSR on some MSRs. #VE on pending page is just one of the sources, Linux
> > just wants to disable this kind of #VE since it wants to prevent unexpected
> > #VE during SYSCALL gap.
> 
> Linux guests can't disable those on their own?  Requiring this being
> configured on the host looks rather fragile to me ...

The guest can get the attributes (but can't change them).  If the attributes
aren't what the guest expects, the guest can stop itself.
-- 
Isaku Yamahata 



Re: [RFC PATCH v3 17/36] pflash_cfi01/tdx: Introduce ram_mode of pflash for TDVF

2022-03-21 Thread Isaku Yamahata
On Mon, Mar 21, 2022 at 04:54:51PM +0800,
Xiaoyao Li  wrote:

> On 3/18/2022 10:07 PM, Philippe Mathieu-Daudé wrote:
> > Hi,
> > 
> > On 17/3/22 14:58, Xiaoyao Li wrote:
> > > TDX VM needs to boot with Trust Domain Virtual Firmware (TDVF). Unlike
> > > that OVMF is mapped as rom device, TDVF needs to be mapped as private
> > > memory. This is because TDX architecture doesn't provide read-only
> > > capability for VMM, and it doesn't support instruction emulation due
> > > to guest memory and registers are not accessible for VMM.
> > > 
> > > On the other hand, OVMF can work as TDVF, which is usually configured
> > > as pflash device in QEMU. To keep the same usage (QEMU parameter),
> > > introduce ram_mode to pflash for TDVF. When it's creating a TDX VM,
> > > ram_mode will be enabled automatically that map the firmware as RAM.
> > > 
> > > Note, this implies two things:
> > > ?? 1. TDVF (OVMF) is not read-only (write-protected).
> > > 
> > > ?? 2. It doesn't support non-volatile UEFI variables as what pflash
> > >  supports that the change to non-volatile UEFI variables won't get
> > >  synced back to backend vars.fd file.
> > > 
> > > Signed-off-by: Xiaoyao Li 
> > > ---
> > > ?? hw/block/pflash_cfi01.c | 25 ++---
> > > ?? hw/i386/pc_sysfw.c?? | 14 +++---
> > > ?? 2 files changed, 29 insertions(+), 10 deletions(-)
> > 
> > If you don't need a pflash device, don't use it: simply map your nvram
> > region as ram in your machine. No need to clutter the pflash model like
> > that.
> 
> I know it's dirty to hack the pflash device. The purpose is to make the user
> interface unchanged that people can still use
> 
>   -drive if=pflash,format=raw,unit=0,file=/path/to/OVMF_CODE.fd
> -drive if=pflash,format=raw,unit=1,file=/path/to/OVMF_VARS.fd
> 
> to create TD guest.

For compatibility of the qemu command line, you don't have to modify the pflash
device.  Don't instantiate pflash at pc_system_flash_create(); instead, at
pc_system_firmware_init() you can retrieve the necessary parameters and then
populate memory.  Although it's still hacky, it would be a bit cleaner.
-- 
Isaku Yamahata 



Re: [RFC PATCH v3 33/36] i386/tdx: Only configure MSR_IA32_UCODE_REV in kvm_init_msrs() for TDs

2022-03-18 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:59:10PM +0800,
Xiaoyao Li  wrote:

> For TDs, only MSR_IA32_UCODE_REV in kvm_init_msrs() can be configured
> by VMM, while the features enumerated/controlled by other MSRs except
> MSR_IA32_UCODE_REV in kvm_init_msrs() are not under control of VMM.
> 
> Only configure MSR_IA32_UCODE_REV for TDs.

non-TDs?
-- 
Isaku Yamahata 



Re: [RFC PATCH v3 18/36] i386/tdvf: Introduce function to parse TDVF metadata

2022-03-18 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:55PM +0800,
Xiaoyao Li  wrote:

> diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c
> new file mode 100644
> index ..02da1d2c12dd
> --- /dev/null
> +++ b/hw/i386/tdvf.c
> @@ -0,0 +1,196 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> +
> + * Copyright (c) 2020 Intel Corporation
> + * Author: Isaku Yamahata 
> + *
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "hw/i386/pc.h"
> +#include "hw/i386/tdvf.h"
> +#include "sysemu/kvm.h"
> +
> +#define TDX_METADATA_GUID "e47a6535-984a-4798-865e-4685a7bf8ec2"
> +#define TDX_METADATA_VERSION1
> +#define TDVF_SIGNATURE_LE32 0x46564454 /* TDVF as little endian */

_LE32 doesn't make sense.  QEMU doesn't provide a macro version of byteswap.
Let's convert at the usage point.


> +
> +typedef struct {
> +uint32_t DataOffset;
> +uint32_t RawDataSize;
> +uint64_t MemoryAddress;
> +uint64_t MemoryDataSize;
> +uint32_t Type;
> +uint32_t Attributes;
> +} TdvfSectionEntry;
> +
> +typedef struct {
> +uint32_t Signature;
> +uint32_t Length;
> +uint32_t Version;
> +uint32_t NumberOfSectionEntries;
> +TdvfSectionEntry SectionEntries[];
> +} TdvfMetadata;
> +
> +struct tdx_metadata_offset {
> +uint32_t offset;
> +};
> +
> +static TdvfMetadata *tdvf_get_metadata(void *flash_ptr, int size)
> +{
> +TdvfMetadata *metadata;
> +uint32_t offset = 0;
> +uint8_t *data;
> +
> +if ((uint32_t) size != size) {
> +return NULL;
> +}
> +
> +if (pc_system_ovmf_table_find(TDX_METADATA_GUID, , NULL)) {
> +offset = size - le32_to_cpu(((struct tdx_metadata_offset 
> *)data)->offset);
> +
> +if (offset + sizeof(*metadata) > size) {
> +return NULL;
> +}
> +} else {
> +error_report("Cannot find TDX_METADATA_GUID\n");
> +return NULL;
> +    }
> +
> +metadata = flash_ptr + offset;
> +
> +/* Finally, verify the signature to determine if this is a TDVF image. */
> +   if (metadata->Signature != TDVF_SIGNATURE_LE32) {


metadata->Signature = le32_to_cpu(metadata->Signature);
metadata->Signature != TDVF_SIGNATURE for consistency.

-- 
Isaku Yamahata 



Re: [RFC PATCH v3 16/36] i386/tdx: Set kvm_readonly_mem_enabled to false for TDX VM

2022-03-18 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:53PM +0800,
Xiaoyao Li  wrote:

> TDX only supports readonly for shared memory but not for private memory.
> 
> In the view of QEMU, it has no idea whether a memslot is used by shared
> memory of private. Thus just mark kvm_readonly_mem_enabled to false to
> TDX VM for simplicity.
> 
> Note, pflash has dependency on readonly capability from KVM while TDX
> wants to reuse pflash interface to load TDVF (as OVMF). Excuse TDX VM
> for readonly check in pflash.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  hw/i386/pc_sysfw.c| 2 +-
>  target/i386/kvm/tdx.c | 9 +
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
> index c8b17af95353..75b34d02cb4f 100644
> --- a/hw/i386/pc_sysfw.c
> +++ b/hw/i386/pc_sysfw.c
> @@ -245,7 +245,7 @@ void pc_system_firmware_init(PCMachineState *pcms,
>  /* Machine property pflash0 not set, use ROM mode */
>  x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false);
>  } else {
> -if (kvm_enabled() && !kvm_readonly_mem_enabled()) {
> +if (kvm_enabled() && (!kvm_readonly_mem_enabled() && !is_tdx_vm())) {

Is this called before tdx_kvm_init()?

Thanks,


>  /*
>   * Older KVM cannot execute from device memory. So, flash
>   * memory cannot be used unless the readonly memory kvm
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 94a9c1ea7e9c..1bb8211e74e6 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -115,6 +115,15 @@ int tdx_kvm_init(MachineState *ms, Error **errp)
>  get_tdx_capabilities();
>  }
>  
> +/*
> + * Set kvm_readonly_mem_allowed to false, because TDX only supports 
> readonly
> + * memory for shared memory but not for private memory. Besides, whether 
> a
> + * memslot is private or shared is not determined by QEMU.
> + *
> + * Thus, just mark readonly memory not supported for simplicity.
> + */
> +kvm_readonly_mem_allowed = false;
> +
>  tdx_guest = tdx;
>  
>  return 0;
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v3 09/36] KVM: Introduce kvm_arch_pre_create_vcpu()

2022-03-18 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:46PM +0800,
Xiaoyao Li  wrote:

> Introduce kvm_arch_pre_create_vcpu(), to perform arch-dependent
> work prior to create any vcpu. This is for i386 TDX because it needs
> call TDX_INIT_VM before creating any vcpu.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  accel/kvm/kvm-all.c| 7 +++
>  include/sysemu/kvm.h   | 1 +
>  target/arm/kvm64.c | 5 +
>  target/i386/kvm/kvm.c  | 5 +
>  target/mips/kvm.c  | 5 +
>  target/ppc/kvm.c   | 5 +
>  target/s390x/kvm/kvm.c | 5 +
>  7 files changed, 33 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 27864dfaeaaa..a4bb449737a6 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -465,6 +465,13 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>  
>  trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
>  
> +ret = kvm_arch_pre_create_vcpu(cpu);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret,
> + "kvm_init_vcpu: kvm_arch_pre_create_vcpu() failed");
> +goto err;
> +}
> +
>  ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>  if (ret < 0) {
>  error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed 
> (%lu)",
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index a783c7886811..0e94031ab7c7 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -373,6 +373,7 @@ int kvm_arch_put_registers(CPUState *cpu, int level);
>  
>  int kvm_arch_init(MachineState *ms, KVMState *s);
>  
> +int kvm_arch_pre_create_vcpu(CPUState *cpu);
>  int kvm_arch_init_vcpu(CPUState *cpu);
>  int kvm_arch_destroy_vcpu(CPUState *cpu);
>  
> diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
> index ccadfbbe72be..ae7336851c62 100644
> --- a/target/arm/kvm64.c
> +++ b/target/arm/kvm64.c
> @@ -935,6 +935,11 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  return kvm_arm_init_cpreg_list(cpu);
>  }
>  
> +int kvm_arch_pre_create_vcpu(CPUState *cpu)
> +{
> +return 0;
> +}
> +

Weak symbol can be used to avoid update all the arch.

Thanks,
-- 
Isaku Yamahata 



Re: [RFC PATCH v3 08/36] i386/tdx: Adjust get_supported_cpuid() for TDX VM

2022-03-18 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:45PM +0800,
Xiaoyao Li  wrote:

> For TDX, the allowable CPUID configuration differs from what KVM
> reports for KVM scope via KVM_GET_SUPPORTED_CPUID.
> 
> - Some CPUID bits are not supported for TDX VM while KVM reports the
>   support. Mask them off for TDX VM. e.g., CPUID_EXT_VMX, some PV
>   featues.
> 
> - The supported XCR0 and XSS bits needs to be caped by tdx_caps, because
>   KVM uses them to setup XFAM of TD.
> 
> Introduce tdx_get_supported_cpuid() to adjust the
> kvm_arch_get_supported_cpuid() for TDX VM.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/cpu.h |  5 +
>  target/i386/kvm/kvm.c |  4 
>  target/i386/kvm/tdx.c | 39 +++
>  target/i386/kvm/tdx.h |  2 ++
>  4 files changed, 50 insertions(+)
> 
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 5e406088a91a..7fa30f4ed7db 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -566,6 +566,11 @@ typedef enum X86Seg {
>  #define ESA_FEATURE_XFD_MASK(1U << ESA_FEATURE_XFD_BIT)
>  
>  
> +#define XCR0_MASK   (XSTATE_FP_MASK | XSTATE_SSE_MASK | XSTATE_YMM_MASK 
> | \
> + XSTATE_BNDREGS_MASK | XSTATE_BNDCSR_MASK | \
> + XSTATE_OPMASK_MASK | XSTATE_ZMM_Hi256_MASK | \
> + XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK)
> +
>  /* CPUID feature words */
>  typedef enum FeatureWord {
>  FEAT_1_EDX, /* CPUID[1].EDX */
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 26ed5faf07b8..ddbe8f64fadb 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -486,6 +486,10 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, 
> uint32_t function,
>  ret |= 1U << KVM_HINTS_REALTIME;
>  }
>  
> +if (is_tdx_vm()) {
> +tdx_get_supported_cpuid(function, index, reg, );
> +}
> +
>  return ret;
>  }
>  
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index 846511b299f4..e4ee55f30c79 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -14,6 +14,7 @@
>  #include "qemu/osdep.h"
>  #include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "standard-headers/asm-x86/kvm_para.h"
>  #include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
> @@ -110,6 +111,44 @@ int tdx_kvm_init(MachineState *ms, Error **errp)
>  return 0;
>  }
>  
> +void tdx_get_supported_cpuid(uint32_t function, uint32_t index, int reg,
> + uint32_t *ret)
> +{
> +switch (function) {
> +case 1:
> +if (reg == R_ECX) {
> +*ret &= ~CPUID_EXT_VMX;
> +}
> +break;
> +case 0xd:
> +if (index == 0) {
> +if (reg == R_EAX) {
> +*ret &= (uint32_t)tdx_caps->xfam_fixed0 & XCR0_MASK;
> +*ret |= (uint32_t)tdx_caps->xfam_fixed1 & XCR0_MASK;
> +} else if (reg == R_EDX) {
> +*ret &= (tdx_caps->xfam_fixed0 & XCR0_MASK) >> 32;
> +*ret |= (tdx_caps->xfam_fixed1 & XCR0_MASK) >> 32;
> +}
> +} else if (index == 1) {
> +/* TODO: Adjust XSS when it's supported. */
> +}
> +break;
> +case KVM_CPUID_FEATURES:
> +if (reg == R_EAX) {
> +*ret &= ~((1ULL << KVM_FEATURE_CLOCKSOURCE) |
> +  (1ULL << KVM_FEATURE_CLOCKSOURCE2) |
> +  (1ULL << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
> +  (1ULL << KVM_FEATURE_ASYNC_PF) |
> +  (1ULL << KVM_FEATURE_ASYNC_PF_VMEXIT) |
> +  (1ULL << KVM_FEATURE_ASYNC_PF_INT));

Because a new feature bit may be introduced in the future (though it's unlikely),
*ret &= (supported_bits) is better than *ret &= ~(unsupported_bits).

Thanks,

> +}
> +break;
> +default:
> +/* TODO: Use tdx_caps to adjust CPUID leafs. */
> +break;
> +}
> +}
> +
>  /* tdx guest */
>  OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
>     tdx_guest,
> diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h
> index 4036ca2f3f99..06599b65b827 100644
> --- a/target/i386/kvm/tdx.h
> +++ b/target/i386/kvm/tdx.h
> @@ -27,5 +27,7 @@ bool is_tdx_vm(void);
>  #endif /* CONFIG_TDX */
>  
>  int tdx_kvm_init(MachineState *ms, Error **errp);
> +void tdx_get_supported_cpuid(uint32_t function, uint32_t index, int reg,
> + uint32_t *ret);
>  
>  #endif /* QEMU_I386_TDX_H */
> -- 
> 2.27.0
> 
> 

-- 
Isaku Yamahata 



Re: [RFC PATCH v3 06/36] i386/tdx: Get tdx_capabilities via KVM_TDX_CAPABILITIES

2022-03-17 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:43PM +0800,
Xiaoyao Li  wrote:

> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index e3b94373b316..bed337e5ba18 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -14,10 +14,77 @@
>  #include "qemu/osdep.h"
>  #include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
>  #include "tdx.h"
>  
> +enum tdx_ioctl_level{
> +TDX_VM_IOCTL,
> +TDX_VCPU_IOCTL,
> +};
> +
> +static int __tdx_ioctl(void *state, enum tdx_ioctl_level level, int cmd_id,
> +__u32 metadata, void *data)
> +{
> +struct kvm_tdx_cmd tdx_cmd;
> +int r;
> +
> +memset(_cmd, 0x0, sizeof(tdx_cmd));
> +
> +tdx_cmd.id = cmd_id;
> +tdx_cmd.metadata = metadata;
> +tdx_cmd.data = (__u64)(unsigned long)data;
> +
> +switch (level) {
> +case TDX_VM_IOCTL:
> +r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +case TDX_VCPU_IOCTL:
> +r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
> +break;
> +default:
> +error_report("Invalid tdx_ioctl_level %d", level);
> +exit(1);
> +}
> +
> +return r;
> +}
> +
> +#define tdx_vm_ioctl(cmd_id, metadata, data) \
> +__tdx_ioctl(NULL, TDX_VM_IOCTL, cmd_id, metadata, data)
> +
> +#define tdx_vcpu_ioctl(cpu, cmd_id, metadata, data) \
> +__tdx_ioctl(cpu, TDX_VCPU_IOCTL, cmd_id, metadata, data)

No point in using a macro here.  A normal (inline) function works just as well.

-- 
Isaku Yamahata 



Re: [RFC PATCH v3 05/36] i386/tdx: Implement tdx_kvm_init() to initialize TDX VM context

2022-03-17 Thread Isaku Yamahata
On Thu, Mar 17, 2022 at 09:58:42PM +0800,
Xiaoyao Li  wrote:

> Introduce tdx_kvm_init() and invoke it in kvm_confidential_guest_init()
> if it's a TDX VM. More initialization will be added later.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/kvm/kvm.c   | 15 ++-
>  target/i386/kvm/meson.build |  2 +-
>  target/i386/kvm/tdx-stub.c  |  9 +
>  target/i386/kvm/tdx.c   | 13 +
>  target/i386/kvm/tdx.h   |  2 ++
>  5 files changed, 31 insertions(+), 10 deletions(-)
>  create mode 100644 target/i386/kvm/tdx-stub.c
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 70454355f3bf..26ed5faf07b8 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -54,6 +54,7 @@
>  #include "migration/blocker.h"
>  #include "exec/memattrs.h"
>  #include "trace.h"
> +#include "tdx.h"
>  
>  //#define DEBUG_KVM
>  
> @@ -2360,6 +2361,8 @@ static int kvm_confidential_guest_init(MachineState 
> *ms, Error **errp)
>  {
>  if (object_dynamic_cast(OBJECT(ms->cgs), TYPE_SEV_GUEST)) {
>  return sev_kvm_init(ms->cgs, errp);
> +} else if (object_dynamic_cast(OBJECT(ms->cgs), TYPE_TDX_GUEST)) {
> +return tdx_kvm_init(ms, errp);
>  }
>  
>  return 0;
> @@ -2374,16 +2377,10 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  Error *local_err = NULL;
>  
>  /*
> - * Initialize SEV context, if required
> + * Initialize confidential guest (SEV/TDX) context, if required
>   *
> - * If no memory encryption is requested (ms->cgs == NULL) this is
> - * a no-op.
> - *
> - * It's also a no-op if a non-SEV confidential guest support
> - * mechanism is selected.  SEV is the only mechanism available to
> - * select on x86 at present, so this doesn't arise, but if new
> - * mechanisms are supported in future (e.g. TDX), they'll need
> - * their own initialization either here or elsewhere.
> + * It's a no-op if a non-SEV/non-tdx confidential guest support
> + * mechanism is selected, i.e., ms->cgs == NULL
>   */
>  ret = kvm_confidential_guest_init(ms, _err);
>  if (ret < 0) {
> diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> index b2d7d41acde2..fd30b93ecec9 100644
> --- a/target/i386/kvm/meson.build
> +++ b/target/i386/kvm/meson.build
> @@ -9,7 +9,7 @@ i386_softmmu_kvm_ss.add(files(
>  
>  i386_softmmu_kvm_ss.add(when: 'CONFIG_SEV', if_false: files('sev-stub.c'))
>  
> -i386_softmmu_kvm_ss.add(when: 'CONFIG_TDX', if_true: files('tdx.c'))
> +i386_softmmu_kvm_ss.add(when: 'CONFIG_TDX', if_true: files('tdx.c'), 
> if_false: files('tdx-stub.c'))
>  
>  i386_softmmu_ss.add(when: 'CONFIG_HYPERV', if_true: files('hyperv.c'), 
> if_false: files('hyperv-stub.c'))
>  
> diff --git a/target/i386/kvm/tdx-stub.c b/target/i386/kvm/tdx-stub.c
> new file mode 100644
> index ..1df24735201e
> --- /dev/null
> +++ b/target/i386/kvm/tdx-stub.c
> @@ -0,0 +1,9 @@
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +
> +#include "tdx.h"
> +
> +int tdx_kvm_init(MachineState *ms, Error **errp)
> +{
> +return -EINVAL;
> +}
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index d3792d4a3d56..e3b94373b316 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -12,10 +12,23 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qapi/error.h"
>  #include "qom/object_interfaces.h"
>  
> +#include "hw/i386/x86.h"
>  #include "tdx.h"
>  
> +int tdx_kvm_init(MachineState *ms, Error **errp)
> +{
> +TdxGuest *tdx = (TdxGuest *)object_dynamic_cast(OBJECT(ms->cgs),
> +TYPE_TDX_GUEST);

The caller already checks it.  This is redundant. Maybe assert?


-- 
Isaku Yamahata 



[RFC PATCH v2 26/44] pci-host/q35: Move PAM initialization above SMRAM initialization

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

In mch_realize(), process PAM initialization before SMRAM initialization so
that a later patch can skip all the SMRAM-related code with a single check.

Signed-off-by: Isaku Yamahata 
---
 hw/pci-host/q35.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 9a2be237d7..68234d209c 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -571,6 +571,16 @@ static void mch_realize(PCIDevice *d, Error **errp)
 pc_pci_as_mapping_init(OBJECT(mch), mch->system_memory,
mch->pci_address_space);
 
+/* PAM */
+init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
+ mch->pci_address_space, >pam_regions[0],
+ PAM_BIOS_BASE, PAM_BIOS_SIZE);
+for (i = 0; i < ARRAY_SIZE(mch->pam_regions) - 1; ++i) {
+init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
+ mch->pci_address_space, >pam_regions[i+1],
+ PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
+}
+
 /* if *disabled* show SMRAM to all CPUs */
 memory_region_init_alias(>smram_region, OBJECT(mch), "smram-region",
  mch->pci_address_space, 
MCH_HOST_BRIDGE_SMRAM_C_BASE,
@@ -637,15 +647,6 @@ static void mch_realize(PCIDevice *d, Error **errp)
 
 object_property_add_const_link(qdev_get_machine(), "smram",
OBJECT(>smram));
-
-init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
- mch->pci_address_space, >pam_regions[0],
- PAM_BIOS_BASE, PAM_BIOS_SIZE);
-for (i = 0; i < ARRAY_SIZE(mch->pam_regions) - 1; ++i) {
-init_pam(DEVICE(mch), mch->ram_memory, mch->system_memory,
- mch->pci_address_space, >pam_regions[i+1],
- PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
-}
 }
 
 uint64_t mch_mcfg_base(void)
-- 
2.25.1




[RFC PATCH v2 23/44] i386/tdx: Use KVM_TDX_INIT_VCPU to pass HOB to TDVF

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Specify the initial value for RCX/R8 to be the address of the HOB.
Don't propagate the value to Qemu's cache of the registers so as to
avoid implying that the register state is valid, e.g. Qemu doesn't model
TDX-SEAM behavior for initializing other GPRs.

Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/tdx.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 0cd649dd01..c348626dbf 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -285,10 +285,17 @@ out:
 
 void tdx_post_init_vcpu(CPUState *cpu)
 {
-CPUX86State *env = _CPU(cpu)->env;
+MachineState *ms = MACHINE(qdev_get_machine());
+TdxGuest *tdx = (TdxGuest *)object_dynamic_cast(OBJECT(ms->cgs),
+TYPE_TDX_GUEST);
+TdxFirmwareEntry *hob;
+
+if (!tdx) {
+return;
+}
 
-_tdx_ioctl(cpu, KVM_TDX_INIT_VCPU, 0,
-   (void *)(unsigned long)env->regs[R_ECX]);
+hob = tdx_get_hob_entry(tdx);
+_tdx_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address);
 }
 
 static bool tdx_guest_get_debug(Object *obj, Error **errp)
-- 
2.25.1




[RFC PATCH v2 43/44] i386/tdx: disallow level interrupt and SMI/INIT/SIPI delivery mode

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

TDX doesn't allow level-triggered interrupts or the SMI/INIT/SIPI interrupt
delivery modes.  So disallow them.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 24af05c313..c372403b87 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1307,6 +1307,9 @@ static int x86_kvm_type(MachineState *ms, const char 
*vm_type)
 kvm_type = KVM_X86_LEGACY_VM;
 } else if (!g_ascii_strcasecmp(vm_type, "tdx")) {
 kvm_type = KVM_X86_TDX_VM;
+X86_MACHINE(ms)->eoi_intercept_unsupported = true;
+X86_MACHINE(ms)->smi_unsupported = true;
+X86_MACHINE(ms)->init_sipi_unsupported = true;
 } else {
 error_report("Unknown kvm-type specified '%s'", vm_type);
 exit(1);
-- 
2.25.1




[RFC PATCH v2 28/44] i386/tdx: Force x2apic mode and routing for TDs

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

TDX requires x2apic and "resets" vCPUs to have x2apic enabled.  Model
this in QEMU and unconditionally enable x2apic interrupt routing.

This fixes issues where interrupts from IRQFD would not get forwarded to
the guest due to KVM silently dropping the invalid routing entry.

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 hw/intc/apic_common.c   | 12 
 include/hw/i386/apic.h  |  1 +
 include/hw/i386/apic_internal.h |  1 +
 target/i386/kvm/tdx.c   |  7 +++
 4 files changed, 21 insertions(+)

diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
index 2a20982066..b95fed95da 100644
--- a/hw/intc/apic_common.c
+++ b/hw/intc/apic_common.c
@@ -262,6 +262,15 @@ void apic_designate_bsp(DeviceState *dev, bool bsp)
 }
 }
 
+void apic_force_x2apic(DeviceState *dev)
+{
+if (dev == NULL) {
+return;
+}
+
+APIC_COMMON(dev)->force_x2apic = true;
+}
+
 static void apic_reset_common(DeviceState *dev)
 {
 APICCommonState *s = APIC_COMMON(dev);
@@ -270,6 +279,9 @@ static void apic_reset_common(DeviceState *dev)
 
 bsp = s->apicbase & MSR_IA32_APICBASE_BSP;
 s->apicbase = APIC_DEFAULT_ADDRESS | bsp | MSR_IA32_APICBASE_ENABLE;
+if (s->force_x2apic) {
+s->apicbase |= MSR_IA32_APICBASE_EXTD;
+}
 s->id = s->initial_apic_id;
 
 apic_reset_irq_delivered();
diff --git a/include/hw/i386/apic.h b/include/hw/i386/apic.h
index da1d2fe155..7d05abd7e0 100644
--- a/include/hw/i386/apic.h
+++ b/include/hw/i386/apic.h
@@ -19,6 +19,7 @@ void apic_init_reset(DeviceState *s);
 void apic_sipi(DeviceState *s);
 void apic_poll_irq(DeviceState *d);
 void apic_designate_bsp(DeviceState *d, bool bsp);
+void apic_force_x2apic(DeviceState *d);
 int apic_get_highest_priority_irr(DeviceState *dev);
 
 /* pc.c */
diff --git a/include/hw/i386/apic_internal.h b/include/hw/i386/apic_internal.h
index c175e7e718..eda0b5a587 100644
--- a/include/hw/i386/apic_internal.h
+++ b/include/hw/i386/apic_internal.h
@@ -187,6 +187,7 @@ struct APICCommonState {
 DeviceState *vapic;
 hwaddr vapic_paddr; /* note: persistence via kvmvapic */
 bool legacy_instance_id;
+bool force_x2apic;
 };
 
 typedef struct VAPICState {
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index c348626dbf..47a502051c 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -139,6 +139,11 @@ int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp)
 tdx_caps->nr_cpuid_configs = TDX1_MAX_NR_CPUID_CONFIGS;
 tdx_ioctl(KVM_TDX_CAPABILITIES, 0, tdx_caps);
 
+if (!kvm_enable_x2apic()) {
+error_report("Failed to enable x2apic in KVM");
+exit(1);
+}
+
 qemu_add_machine_init_done_late_notifier(_machine_done_late_notify);
 
 return 0;
@@ -296,6 +301,8 @@ void tdx_post_init_vcpu(CPUState *cpu)
 
 hob = tdx_get_hob_entry(tdx);
 _tdx_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address);
+
+apic_force_x2apic(X86_CPU(cpu)->apic_state);
 }
 
 static bool tdx_guest_get_debug(Object *obj, Error **errp)
-- 
2.25.1




[RFC PATCH v2 42/44] hw/i386: add a flag to disable init/sipi delivery mode of interrupt

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a new flag to X86Machine to disallow INIT/SIPI delivery mode of
interrupt and pass it to ioapic creation so that ioapic disallows INIT/SIPI
delivery mode.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/microvm.c |  4 ++--
 hw/i386/pc_piix.c |  2 +-
 hw/i386/pc_q35.c  |  2 +-
 hw/i386/x86.c | 11 +--
 include/hw/i386/x86.h |  7 +--
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 7504324891..c790adecfb 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -176,11 +176,11 @@ static void microvm_devices_init(MicrovmMachineState *mms)
 isa_bus_irqs(isa_bus, x86ms->gsi);
 
 ioapic_init_gsi(gsi_state, "machine", x86ms->eoi_intercept_unsupported,
-x86ms->smi_unsupported);
+x86ms->smi_unsupported, x86ms->init_sipi_unsupported);
 if (ioapics > 1) {
 x86ms->ioapic2 = ioapic_init_secondary(
 gsi_state, x86ms->eoi_intercept_unsupported,
-x86ms->smi_unsupported);
+x86ms->smi_unsupported, x86ms->init_sipi_unsupported);
 }
 
 kvmclock_create(true);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 0958035bf8..940cd0f47b 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -224,7 +224,7 @@ static void pc_init1(MachineState *machine,
 
 if (pcmc->pci_enabled) {
 ioapic_init_gsi(gsi_state, "i440fx", x86ms->eoi_intercept_unsupported,
-x86ms->smi_unsupported);
+x86ms->smi_unsupported, x86ms->init_sipi_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 1ab8a6a78b..8f677ec136 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -257,7 +257,7 @@ static void pc_q35_init(MachineState *machine)
 
 if (pcmc->pci_enabled) {
 ioapic_init_gsi(gsi_state, "q35", x86ms->eoi_intercept_unsupported,
-x86ms->smi_unsupported);
+x86ms->smi_unsupported, x86ms->init_sipi_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 3dc36e3590..24af05c313 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -610,7 +610,8 @@ void gsi_handler(void *opaque, int n, int level)
 
 void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name,
  bool level_trigger_unsupported,
- bool smi_unsupported)
+ bool smi_unsupported,
+ bool init_sipi_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
@@ -628,6 +629,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name,
  level_trigger_unsupported, NULL);
 object_property_set_bool(OBJECT(dev), "smi_unsupported",
  smi_unsupported, NULL);
+object_property_set_bool(OBJECT(dev), "init_sipi_unsupported",
+ init_sipi_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
@@ -639,7 +642,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name,
 
 DeviceState *ioapic_init_secondary(GSIState *gsi_state,
bool level_trigger_unsupported,
-   bool smi_unsupported)
+   bool smi_unsupported,
+   bool init_sipi_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
@@ -650,6 +654,8 @@ DeviceState *ioapic_init_secondary(GSIState *gsi_state,
  level_trigger_unsupported, NULL);
 object_property_set_bool(OBJECT(dev), "smi_unsupported",
  smi_unsupported, NULL);
+object_property_set_bool(OBJECT(dev), "init_sipi_unsupported",
+ init_sipi_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
@@ -1325,6 +1331,7 @@ static void x86_machine_initfn(Object *obj)
 x86ms->bus_lock_ratelimit = 0;
 x86ms->eoi_intercept_unsupported = false;
 x86ms->smi_unsupported = false;
+x86ms->init_sipi_unsupported = false;
 
 object_property_add_str(obj, "kvm-type",
 x86_get_kvm_type, x86_set_kvm_type);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 3d1d74d171..bca8c2b57d 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -65,6 +65,7 @@ struct X86MachineState {
 uint16_t boot_cpus;
 bool eoi_intercept_unsupported;
 bool smi_unsupported;
+bool init_sipi_unsupported;
 
 OnOffAuto smm;
 OnOffAuto acpi;
@@ -143,9 +144,11 @@ qemu_irq x86_allocate_cpu_i

[RFC PATCH v2 27/44] q35: Introduce smm_ranges property for q35-pci-host

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a q35 property to check whether or not SMM ranges, e.g. SMRAM, TSEG,
etc... exist for the target platform.  TDX doesn't support SMM and doesn't
play nice with QEMU modifying related guest memory ranges.

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 hw/i386/pc_q35.c  |  2 ++
 hw/pci-host/q35.c | 42 +++
 include/hw/i386/pc.h  |  1 +
 include/hw/pci-host/q35.h |  1 +
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 46a0f196f4..1718aa94d9 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -219,6 +219,8 @@ static void pc_q35_init(MachineState *machine)
 x86ms->below_4g_mem_size, NULL);
 object_property_set_int(OBJECT(q35_host), PCI_HOST_ABOVE_4G_MEM_SIZE,
 x86ms->above_4g_mem_size, NULL);
+object_property_set_bool(OBJECT(q35_host), PCI_HOST_PROP_SMM_RANGES,
+ x86_machine_is_smm_enabled(x86ms), NULL);
 /* pci */
 sysbus_realize_and_unref(SYS_BUS_DEVICE(q35_host), _fatal);
 phb = PCI_HOST_BRIDGE(q35_host);
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 68234d209c..ba28d969ba 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -183,6 +183,8 @@ static Property q35_host_props[] = {
  mch.below_4g_mem_size, 0),
 DEFINE_PROP_SIZE(PCI_HOST_ABOVE_4G_MEM_SIZE, Q35PCIHost,
  mch.above_4g_mem_size, 0),
+DEFINE_PROP_BOOL(PCI_HOST_PROP_SMM_RANGES, Q35PCIHost,
+ mch.has_smm_ranges, true),
 DEFINE_PROP_BOOL("x-pci-hole64-fix", Q35PCIHost, pci_hole64_fix, true),
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -218,6 +220,7 @@ static void q35_host_initfn(Object *obj)
 /* mch's object_initialize resets the default value, set it again */
 qdev_prop_set_uint64(DEVICE(s), PCI_HOST_PROP_PCI_HOLE64_SIZE,
  Q35_PCI_HOST_HOLE64_SIZE_DEFAULT);
+
 object_property_add(obj, PCI_HOST_PROP_PCI_HOLE_START, "uint32",
 q35_host_get_pci_hole_start,
 NULL, NULL, NULL);
@@ -478,6 +481,10 @@ static void mch_write_config(PCIDevice *d,
 mch_update_pam(mch);
 }
 
+if (!mch->has_smm_ranges) {
+return;
+}
+
 if (ranges_overlap(address, len, MCH_HOST_BRIDGE_SMRAM,
MCH_HOST_BRIDGE_SMRAM_SIZE)) {
 mch_update_smram(mch);
@@ -496,10 +503,13 @@ static void mch_write_config(PCIDevice *d,
 static void mch_update(MCHPCIState *mch)
 {
 mch_update_pciexbar(mch);
+
 mch_update_pam(mch);
-mch_update_smram(mch);
-mch_update_ext_tseg_mbytes(mch);
-mch_update_smbase_smram(mch);
+if (mch->has_smm_ranges) {
+mch_update_smram(mch);
+mch_update_ext_tseg_mbytes(mch);
+mch_update_smbase_smram(mch);
+}
 
 /*
  * pci hole goes from end-of-low-ram to io-apic.
@@ -540,18 +550,20 @@ static void mch_reset(DeviceState *qdev)
 pci_set_quad(d->config + MCH_HOST_BRIDGE_PCIEXBAR,
  MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT);
 
-d->config[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_DEFAULT;
-d->config[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_DEFAULT;
-d->wmask[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_WMASK;
-d->wmask[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_WMASK;
+if (mch->has_smm_ranges) {
+d->config[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_DEFAULT;
+d->config[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_DEFAULT;
+d->wmask[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_WMASK;
+d->wmask[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_WMASK;
 
-if (mch->ext_tseg_mbytes > 0) {
-pci_set_word(d->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES,
- MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY);
-}
+if (mch->ext_tseg_mbytes > 0) {
+pci_set_word(d->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES,
+MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY);
+}
 
-d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0;
-d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff;
+d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0;
+d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff;
+}
 
 mch_update(mch);
 }
@@ -581,6 +593,10 @@ static void mch_realize(PCIDevice *d, Error **errp)
  PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE);
 }
 
+if (!mch->has_smm_ranges) {
+return;
+}
+
 /* if *disabled* show SMRAM to all CPUs */
 memory_region_init_alias(>smram_region, OBJECT(mch), "smram-region",
  mch->pci_address_space, 
MCH_HOST_BRIDGE_SMRAM_C_BASE,
diff --git a/i

[RFC PATCH v2 25/44] q35: Move PCIe BAR check above PAM check in mch_write_config()

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

Process PCIe BAR before PAM so that a future patch can skip all the SMM
related crud with a single check-and-return.

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 hw/pci-host/q35.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 2eb729dff5..9a2be237d7 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -468,16 +468,16 @@ static void mch_write_config(PCIDevice *d,
 
 pci_default_write_config(d, address, val, len);
 
-if (ranges_overlap(address, len, MCH_HOST_BRIDGE_PAM0,
-   MCH_HOST_BRIDGE_PAM_SIZE)) {
-mch_update_pam(mch);
-}
-
 if (ranges_overlap(address, len, MCH_HOST_BRIDGE_PCIEXBAR,
MCH_HOST_BRIDGE_PCIEXBAR_SIZE)) {
 mch_update_pciexbar(mch);
 }
 
+if (ranges_overlap(address, len, MCH_HOST_BRIDGE_PAM0,
+   MCH_HOST_BRIDGE_PAM_SIZE)) {
+mch_update_pam(mch);
+}
+
 if (ranges_overlap(address, len, MCH_HOST_BRIDGE_SMRAM,
MCH_HOST_BRIDGE_SMRAM_SIZE)) {
 mch_update_smram(mch);
-- 
2.25.1




[RFC PATCH v2 08/44] i386/kvm: Skip KVM_X86_SETUP_MCE for TDX guests

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Despite advertising MCE support to the guest, TDX-SEAM doesn't support
injecting #MCs into the guest.   All of the associated setup is thus
rejected by KVM.

Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/kvm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 27b64dedc2..c29cb420a1 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -1825,7 +1825,8 @@ int kvm_arch_init_vcpu(CPUState *cs)
 if (((env->cpuid_version >> 8)&0xF) >= 6
 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
(CPUID_MCE | CPUID_MCA)
-&& kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
+&& kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0
+&& vm_type != KVM_X86_TDX_VM) {
 uint64_t mcg_cap, unsupported_caps;
 int banks;
 int ret;
-- 
2.25.1




[RFC PATCH v2 21/44] i386/tdx: Create the TD HOB list upon machine init done

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Build the TD HOB during machine late initialization, i.e. once guest
memory is fully defined.

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 hw/i386/meson.build   |   2 +-
 hw/i386/tdvf-hob.c| 166 ++
 hw/i386/tdvf-hob.h|  20 +
 target/i386/kvm/tdx.c |  19 +
 4 files changed, 206 insertions(+), 1 deletion(-)
 create mode 100644 hw/i386/tdvf-hob.c
 create mode 100644 hw/i386/tdvf-hob.h

diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index 945e805525..8175c3c638 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -24,7 +24,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files(
   'pc_sysfw.c',
   'acpi-build.c',
   'port92.c'))
-i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c'))
+i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c', 'tdvf-hob.c'))
 
 subdir('kvm')
 subdir('xen')
diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c
new file mode 100644
index 00..5e0bf807f7
--- /dev/null
+++ b/hw/i386/tdvf-hob.c
@@ -0,0 +1,166 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+
+ * Copyright (c) 2020 Intel Corporation
+ * Author: Isaku Yamahata 
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "e820_memory_layout.h"
+#include "hw/i386/x86.h"
+#include "sysemu/tdx.h"
+#include "tdvf-hob.h"
+#include "uefi.h"
+
+typedef struct TdvfHob {
+hwaddr hob_addr;
+void *ptr;
+int size;
+
+/* working area */
+void *current;
+void *end;
+} TdvfHob;
+
+static uint64_t tdvf_current_guest_addr(const TdvfHob *hob)
+{
+return hob->hob_addr + (hob->current - hob->ptr);
+}
+
+static void tdvf_align(TdvfHob *hob, size_t align)
+{
+hob->current = QEMU_ALIGN_PTR_UP(hob->current, align);
+}
+
+static void *tdvf_get_area(TdvfHob *hob, uint64_t size)
+{
+void *ret;
+
+if (hob->current + size > hob->end) {
+error_report("TD_HOB overrun, size = 0x%" PRIx64, size);
+exit(1);
+}
+
+ret = hob->current;
+hob->current += size;
+tdvf_align(hob, 8);
+return ret;
+}
+
+static int tdvf_e820_compare(const void *lhs_, const void* rhs_)
+{
+const struct e820_entry *lhs = lhs_;
+const struct e820_entry *rhs = rhs_;
+
+if (lhs->address == rhs->address) {
+return 0;
+}
+if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) {
+return 1;
+}
+return -1;
+}
+
+static void tdvf_hob_add_memory_resources(TdvfHob *hob)
+{
+EFI_HOB_RESOURCE_DESCRIPTOR *region;
+EFI_RESOURCE_ATTRIBUTE_TYPE attr;
+EFI_RESOURCE_TYPE resource_type;
+
+struct e820_entry *e820_entries, *e820_entry;
+int nr_e820_entries, i;
+
+nr_e820_entries = e820_get_num_entries();
+e820_entries = g_new(struct e820_entry, nr_e820_entries);
+
+/* Copy and sort the e820 tables to add them to the HOB. */
+memcpy(e820_entries, e820_table,
+   nr_e820_entries * sizeof(struct e820_entry));
+qsort(e820_entries, nr_e820_entries, sizeof(struct e820_entry),
+  _e820_compare);
+
+for (i = 0; i < nr_e820_entries; i++) {
+e820_entry = _entries[i];
+
+if (le32_to_cpu(e820_entry->type) == E820_RAM) {
+resource_type = EFI_RESOURCE_SYSTEM_MEMORY;
+attr = EFI_RESOURCE_ATTRIBUTE_TDVF_UNACCEPTED;
+} else {
+resource_type = EFI_RESOURCE_MEMORY_RESERVED;
+attr = EFI_RESOURCE_ATTRIBUTE_TDVF_PRIVATE;
+}
+
+region = tdvf_get_area(hob, sizeof(*region));
+*region = (EFI_HOB_RESOURCE_DESCRIPTOR) {
+.Header = {
+.HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR,
+.HobLength = cpu_to_le16(sizeof(*region)),
+.Reserved = cpu_to_le32(0),
+},
+.Owner = EFI_HOB_OWNER_ZERO,
+.ResourceType = cpu_to_le32(resource_type),
+.ResourceAttribute = cpu_to_le32(attr),
+.PhysicalStart = e820_entry->address,
+.ResourceLength = e820_entry->length,
+};
+}
+
+g_free(e820_entries);
+}
+
+void tdvf_hob_create(TdxGuest *tdx, T

[RFC PATCH v2 24/44] i386/tdx: Add MMIO HOB entries

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

Add MMIO HOB entries, which are needed to enumerate legal MMIO ranges to
early TDVF.

Note, the attribute absolutely must include UNCACHEABLE, else TDVF will
effectively consider it a bad HOB entry and ignore it.

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 hw/i386/tdvf-hob.c | 69 ++
 hw/i386/tdvf-hob.h |  5 
 2 files changed, 74 insertions(+)

diff --git a/hw/i386/tdvf-hob.c b/hw/i386/tdvf-hob.c
index 5e0bf807f7..60c5ed0e03 100644
--- a/hw/i386/tdvf-hob.c
+++ b/hw/i386/tdvf-hob.c
@@ -22,7 +22,10 @@
 #include "qemu/osdep.h"
 #include "qemu/log.h"
 #include "e820_memory_layout.h"
+#include "hw/i386/pc.h"
 #include "hw/i386/x86.h"
+#include "hw/pci/pci_host.h"
+#include "hw/pci/pcie_host.h"
 #include "sysemu/tdx.h"
 #include "tdvf-hob.h"
 #include "uefi.h"
@@ -62,6 +65,70 @@ static void *tdvf_get_area(TdvfHob *hob, uint64_t size)
 return ret;
 }
 
+static void tdvf_hob_add_mmio_resource(TdvfHob *hob, uint64_t start,
+   uint64_t end)
+{
+EFI_HOB_RESOURCE_DESCRIPTOR *region;
+
+if (!start) {
+return;
+}
+
+region = tdvf_get_area(hob, sizeof(*region));
+*region = (EFI_HOB_RESOURCE_DESCRIPTOR) {
+.Header = {
+.HobType = EFI_HOB_TYPE_RESOURCE_DESCRIPTOR,
+.HobLength = cpu_to_le16(sizeof(*region)),
+.Reserved = cpu_to_le32(0),
+},
+.Owner = EFI_HOB_OWNER_ZERO,
+.ResourceType = cpu_to_le32(EFI_RESOURCE_MEMORY_MAPPED_IO),
+.ResourceAttribute = cpu_to_le32(EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO),
+.PhysicalStart = cpu_to_le64(start),
+.ResourceLength = cpu_to_le64(end - start),
+};
+}
+
+static void tdvf_hob_add_mmio_resources(TdvfHob *hob)
+{
+MachineState *ms = MACHINE(qdev_get_machine());
+X86MachineState *x86ms = X86_MACHINE(ms);
+PCIHostState *pci_host;
+uint64_t start, end;
+uint64_t mcfg_base, mcfg_size;
+Object *host;
+
+/* Effectively PCI hole + other MMIO devices. */
+tdvf_hob_add_mmio_resource(hob, x86ms->below_4g_mem_size,
+   APIC_DEFAULT_ADDRESS);
+
+/* Stolen from acpi_get_i386_pci_host(), there's gotta be an easier way. */
+pci_host = OBJECT_CHECK(PCIHostState,
+object_resolve_path("/machine/i440fx", NULL),
+TYPE_PCI_HOST_BRIDGE);
+if (!pci_host) {
+pci_host = OBJECT_CHECK(PCIHostState,
+object_resolve_path("/machine/q35", NULL),
+TYPE_PCI_HOST_BRIDGE);
+}
+g_assert(pci_host);
+
+host = OBJECT(pci_host);
+
+/* PCI hole above 4gb. */
+start = object_property_get_uint(host, PCI_HOST_PROP_PCI_HOLE64_START,
+ NULL);
+end = object_property_get_uint(host, PCI_HOST_PROP_PCI_HOLE64_END, NULL);
+tdvf_hob_add_mmio_resource(hob, start, end);
+
+/* MMCFG region */
+mcfg_base = object_property_get_uint(host, PCIE_HOST_MCFG_BASE, NULL);
+mcfg_size = object_property_get_uint(host, PCIE_HOST_MCFG_SIZE, NULL);
+if (mcfg_base && mcfg_base != PCIE_BASE_ADDR_UNMAPPED && mcfg_size) {
+tdvf_hob_add_mmio_resource(hob, mcfg_base, mcfg_base + mcfg_size);
+}
+}
+
 static int tdvf_e820_compare(const void *lhs_, const void* rhs_)
 {
 const struct e820_entry *lhs = lhs_;
@@ -156,6 +223,8 @@ void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry 
*hob_entry)
 
 tdvf_hob_add_memory_resources();
 
+tdvf_hob_add_mmio_resources();
+
 last_hob = tdvf_get_area(, sizeof(*last_hob));
 *last_hob =  (EFI_HOB_GENERIC_HEADER) {
 .HobType = EFI_HOB_TYPE_END_OF_HOB_LIST,
diff --git a/hw/i386/tdvf-hob.h b/hw/i386/tdvf-hob.h
index c6c5c1d564..9967dbfe5a 100644
--- a/hw/i386/tdvf-hob.h
+++ b/hw/i386/tdvf-hob.h
@@ -17,4 +17,9 @@ void tdvf_hob_create(TdxGuest *tdx, TdxFirmwareEntry 
*hob_entry);
  EFI_RESOURCE_ATTRIBUTE_INITIALIZED |   \
  EFI_RESOURCE_ATTRIBUTE_UNACCEPTED)
 
+#define EFI_RESOURCE_ATTRIBUTE_TDVF_MMIO\
+(EFI_RESOURCE_ATTRIBUTE_PRESENT |   \
+ EFI_RESOURCE_ATTRIBUTE_INITIALIZED |   \
+ EFI_RESOURCE_ATTRIBUTE_UNCACHEABLE)
+
 #endif
-- 
2.25.1




[RFC PATCH v2 22/44] i386/tdx: Add TDVF memory via INIT_MEM_REGION

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add, and optionally measure, TDVF memory via KVM_TDX_INIT_MEM_REGION as
part of finalizing the TD.

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 target/i386/kvm/tdx.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 12b2e02fa2..0cd649dd01 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -85,10 +85,26 @@ static void tdx_finalize_vm(Notifier *notifier, void 
*unused)
 {
 MachineState *ms = MACHINE(qdev_get_machine());
 TdxGuest *tdx = TDX_GUEST(ms->cgs);
+TdxFirmwareEntry *entry;
 
 tdvf_hob_create(tdx, tdx_get_hob_entry(tdx));
 
+for_each_fw_entry(>fw, entry) {
+struct kvm_tdx_init_mem_region mem_region = {
+.source_addr = (__u64)entry->mem_ptr,
+.gpa = entry->address,
+.nr_pages = entry->size / 4096,
+};
+
+__u32 metadata = entry->attributes & TDVF_SECTION_ATTRIBUTES_EXTENDMR ?
+ KVM_TDX_MEASURE_MEMORY_REGION : 0;
+
+tdx_ioctl(KVM_TDX_INIT_MEM_REGION, metadata, _region);
+}
+
 tdx_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL);
+
+tdx->parent_obj.ready = true;
 }
 
 static Notifier tdx_machine_done_late_notify = {
@@ -301,7 +317,6 @@ static void tdx_guest_init(Object *obj)
 {
 TdxGuest *tdx = TDX_GUEST(obj);
 
-tdx->parent_obj.ready = true;
 qemu_mutex_init(>lock);
 
 tdx->debug = false;
-- 
2.25.1




[RFC PATCH v2 14/44] i386/tdx: Frame in the call for KVM_TDX_INIT_VCPU

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 include/sysemu/tdx.h   |  1 +
 target/i386/kvm/kvm.c  |  8 
 target/i386/kvm/tdx-stub.c |  4 
 target/i386/kvm/tdx.c  | 20 
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/sysemu/tdx.h b/include/sysemu/tdx.h
index 36a901e723..03461b6ae8 100644
--- a/include/sysemu/tdx.h
+++ b/include/sysemu/tdx.h
@@ -8,5 +8,6 @@ bool kvm_has_tdx(KVMState *s);
 #endif
 
 void tdx_pre_create_vcpu(CPUState *cpu);
+void tdx_post_init_vcpu(CPUState *cpu);
 
 #endif
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 25dcecd60c..af6b5f350e 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -4122,6 +4122,14 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 
 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+/*
+ * level == KVM_PUT_FULL_STATE is only set by
+ * kvm_cpu_synchronize_post_init() after initialization
+ */
+if (vm_type == KVM_X86_TDX_VM && level == KVM_PUT_FULL_STATE) {
+tdx_post_init_vcpu(cpu);
+}
+
 /* TODO: Allow accessing guest state for debug TDs. */
 if (vm_type == KVM_X86_TDX_VM) {
 return 0;
diff --git a/target/i386/kvm/tdx-stub.c b/target/i386/kvm/tdx-stub.c
index 93d5913c89..93afe07ddb 100644
--- a/target/i386/kvm/tdx-stub.c
+++ b/target/i386/kvm/tdx-stub.c
@@ -12,3 +12,7 @@ bool kvm_has_tdx(KVMState *s)
 void tdx_pre_create_vcpu(CPUState *cpu)
 {
 }
+
+void tdx_post_init_vcpu(CPUState *cpu)
+{
+}
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index b1e4f27c9a..67fb03b4b5 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -38,7 +38,7 @@ bool kvm_has_tdx(KVMState *s)
 return !!(kvm_check_extension(s, KVM_CAP_VM_TYPES) & BIT(KVM_X86_TDX_VM));
 }
 
-static void __tdx_ioctl(int ioctl_no, const char *ioctl_name,
+static void __tdx_ioctl(void *state, int ioctl_no, const char *ioctl_name,
 __u32 metadata, void *data)
 {
 struct kvm_tdx_cmd tdx_cmd;
@@ -51,17 +51,21 @@ static void __tdx_ioctl(int ioctl_no, const char 
*ioctl_name,
 tdx_cmd.data = (__u64)(unsigned long)data;
 
 if (ioctl_no == KVM_TDX_CAPABILITIES) {
-r = kvm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+r = kvm_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+} else if (ioctl_no == KVM_TDX_INIT_VCPU) {
+r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
 } else {
-r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+r = kvm_vm_ioctl(state, KVM_MEMORY_ENCRYPT_OP, _cmd);
 }
 if (r) {
 error_report("%s failed: %s", ioctl_name, strerror(-r));
 exit(1);
 }
 }
+#define _tdx_ioctl(cpu, ioctl_no, metadata, data) \
+__tdx_ioctl(cpu, ioctl_no, stringify(ioctl_no), metadata, data)
 #define tdx_ioctl(ioctl_no, metadata, data) \
-__tdx_ioctl(ioctl_no, stringify(ioctl_no), metadata, data)
+_tdx_ioctl(kvm_state, ioctl_no, metadata, data)
 
 static void tdx_finalize_vm(Notifier *notifier, void *unused)
 {
@@ -219,6 +223,14 @@ out:
 qemu_mutex_unlock(>lock);
 }
 
+void tdx_post_init_vcpu(CPUState *cpu)
+{
+CPUX86State *env = _CPU(cpu)->env;
+
+_tdx_ioctl(cpu, KVM_TDX_INIT_VCPU, 0,
+   (void *)(unsigned long)env->regs[R_ECX]);
+}
+
 static bool tdx_guest_get_debug(Object *obj, Error **errp)
 {
 TdxGuest *tdx = TDX_GUEST(obj);
-- 
2.25.1




[RFC PATCH v2 41/44] ioapic: add property to disallow INIT/SIPI delivery mode

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a property to prevent the ioapic from setting the INIT/SIPI delivery mode.
Without this guard, QEMU can exhibit unexpected behavior.

Signed-off-by: Isaku Yamahata 
---
 hw/intc/ioapic.c  | 19 +++
 hw/intc/ioapic_common.c   | 21 +
 include/hw/i386/ioapic_internal.h |  1 +
 3 files changed, 41 insertions(+)

diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 1815fbd282..f7eb9f7146 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -396,6 +396,22 @@ ioapic_fix_smi_unsupported(uint64_t *entry)
 }
 }
 
+static inline void
+ioapic_fix_init_sipi_unsupported(uint64_t *entry)
+{
+uint64_t delmode = *entry & IOAPIC_LVT_DELIV_MODE;
+if (delmode == IOAPIC_DM_INIT << IOAPIC_LVT_DELIV_MODE_SHIFT ||
+delmode == IOAPIC_DM_SIPI << IOAPIC_LVT_DELIV_MODE_SHIFT) {
+/*
+ * ignore a request for delivery mode of INIT/SIPI
+ */
+warn_report_once("attempting to set delivery mode to INIT/SIPI"
+ "which is not supported");
+*entry &= ~IOAPIC_LVT_DELIV_MODE;
+*entry |= IOAPIC_DM_FIXED << IOAPIC_LVT_DELIV_MODE_SHIFT;
+}
+}
+
 static void
 ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
  unsigned int size)
@@ -442,6 +458,9 @@ ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
 if (s->smi_unsupported) {
 ioapic_fix_smi_unsupported(>ioredtbl[index]);
 }
+if (s->init_sipi_unsupported) {
+ioapic_fix_init_sipi_unsupported(>ioredtbl[index]);
+}
 ioapic_fix_edge_remote_irr(>ioredtbl[index]);
 ioapic_service(s);
 }
diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c
index b8ef7efbad..018bacbf96 100644
--- a/hw/intc/ioapic_common.c
+++ b/hw/intc/ioapic_common.c
@@ -185,6 +185,23 @@ static void ioapic_common_set_smi_unsupported(Object *obj, 
bool value,
 s->smi_unsupported = value;
 }
 
+static bool ioapic_common_get_init_sipi_unsupported(Object *obj, Error **errp)
+{
+IOAPICCommonState *s = IOAPIC_COMMON(obj);
+return s->init_sipi_unsupported;
+}
+
+static void ioapic_common_set_init_sipi_unsupported(Object *obj, bool value,
+   Error **errp)
+{
+DeviceState *dev = DEVICE(obj);
+IOAPICCommonState *s = IOAPIC_COMMON(obj);
+/* only disabling before realize is allowed */
+assert(!dev->realized);
+assert(!s->init_sipi_unsupported);
+s->init_sipi_unsupported = value;
+}
+
 static void ioapic_common_init(Object *obj)
 {
 object_property_add_bool(obj, "level_trigger_unsupported",
@@ -194,6 +211,10 @@ static void ioapic_common_init(Object *obj)
 object_property_add_bool(obj, "smi_unsupported",
  ioapic_common_get_smi_unsupported,
  ioapic_common_set_smi_unsupported);
+
+object_property_add_bool(obj, "init_sipi_unsupported",
+ ioapic_common_get_init_sipi_unsupported,
+ ioapic_common_set_init_sipi_unsupported);
 }
 
 static void ioapic_common_realize(DeviceState *dev, Error **errp)
diff --git a/include/hw/i386/ioapic_internal.h 
b/include/hw/i386/ioapic_internal.h
index 46f22a4f85..634b97426d 100644
--- a/include/hw/i386/ioapic_internal.h
+++ b/include/hw/i386/ioapic_internal.h
@@ -105,6 +105,7 @@ struct IOAPICCommonState {
 Notifier machine_done;
 bool level_trigger_unsupported;
 bool smi_unsupported;
+bool init_sipi_unsupported;
 uint8_t version;
 uint64_t irq_count[IOAPIC_NUM_PINS];
 int irq_level[IOAPIC_NUM_PINS];
-- 
2.25.1




[RFC PATCH v2 20/44] i386/tdx: Parse tdx metadata and store the result into TdxGuestState

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add support for loading TDX's Trusted Domain Virtual Firmware (TDVF) via
the generic loader.  Prioritize the TDVF above plain hex to avoid false
positives with hex (TDVF has explicit metadata to confirm it's a TDVF).

Enumerate TempMem as added, private memory, i.e. E820_RESERVED;
otherwise TDVF will interpret the entire range as MMIO and complain
that the aperture overlaps other MMIO regions.

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
Reported-by: Min M. Xu 
---
 hw/core/generic-loader.c |   5 +
 hw/core/meson.build  |   3 +
 hw/core/tdvf-stub.c  |   6 +
 hw/i386/meson.build  |   1 +
 hw/i386/tdvf.c   | 312 +++
 include/sysemu/tdvf.h|   6 +
 target/i386/kvm/tdx.h|  26 
 7 files changed, 359 insertions(+)
 create mode 100644 hw/core/tdvf-stub.c
 create mode 100644 hw/i386/tdvf.c
 create mode 100644 include/sysemu/tdvf.h

diff --git a/hw/core/generic-loader.c b/hw/core/generic-loader.c
index d14f932eea..ee2f49b47a 100644
--- a/hw/core/generic-loader.c
+++ b/hw/core/generic-loader.c
@@ -34,6 +34,7 @@
 #include "hw/core/cpu.h"
 #include "sysemu/dma.h"
 #include "sysemu/reset.h"
+#include "sysemu/tdvf.h"
 #include "hw/boards.h"
 #include "hw/loader.h"
 #include "hw/qdev-properties.h"
@@ -147,6 +148,10 @@ static void generic_loader_realize(DeviceState *dev, Error 
**errp)
   as);
 }
 
+if (size < 0) {
+size = load_tdvf(s->file);
+}
+
 if (size < 0) {
 size = load_targphys_hex_as(s->file, , as);
 }
diff --git a/hw/core/meson.build b/hw/core/meson.build
index 18f44fb7c2..ec943debf1 100644
--- a/hw/core/meson.build
+++ b/hw/core/meson.build
@@ -24,6 +24,9 @@ common_ss.add(when: 'CONFIG_REGISTER', if_true: 
files('register.c'))
 common_ss.add(when: 'CONFIG_SPLIT_IRQ', if_true: files('split-irq.c'))
 common_ss.add(when: 'CONFIG_XILINX_AXI', if_true: files('stream.c'))
 
+common_ss.add(when: 'CONFIG_TDX', if_false: files('tdvf-stub.c'))
+common_ss.add(when: 'CONFIG_ALL', if_true: files('tdvf-stub.c'))
+
 softmmu_ss.add(files(
   'cpu-sysemu.c',
   'fw-path-provider.c',
diff --git a/hw/core/tdvf-stub.c b/hw/core/tdvf-stub.c
new file mode 100644
index 00..5f2586dd70
--- /dev/null
+++ b/hw/core/tdvf-stub.c
@@ -0,0 +1,6 @@
+#include "sysemu/tdvf.h"
+
+int load_tdvf(const char *filename)
+{
+return -1;
+}
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index e5d109f5c6..945e805525 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -24,6 +24,7 @@ i386_ss.add(when: 'CONFIG_PC', if_true: files(
   'pc_sysfw.c',
   'acpi-build.c',
   'port92.c'))
+i386_ss.add(when: 'CONFIG_TDX', if_true: files('tdvf.c'))
 
 subdir('kvm')
 subdir('xen')
diff --git a/hw/i386/tdvf.c b/hw/i386/tdvf.c
new file mode 100644
index 00..9b0065d656
--- /dev/null
+++ b/hw/i386/tdvf.c
@@ -0,0 +1,312 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+
+ * Copyright (c) 2020 Intel Corporation
+ * Author: Isaku Yamahata 
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/units.h"
+#include "cpu.h"
+#include "exec/hwaddr.h"
+#include "hw/boards.h"
+#include "hw/i386/e820_memory_layout.h"
+#include "hw/i386/tdvf.h"
+#include "hw/i386/x86.h"
+#include "hw/loader.h"
+#include "sysemu/tdx.h"
+#include "sysemu/tdvf.h"
+#include "target/i386/kvm/tdx.h"
+
+static void tdvf_init_ram_memory(MachineState *ms, TdxFirmwareEntry *entry)
+{
+void *ram_ptr = memory_region_get_ram_ptr(ms->ram);
+X86MachineState *x86ms = X86_MACHINE(ms);
+
+if (entry->type == TDVF_SECTION_TYPE_BFV ||
+entry->type == TDVF_SECTION_TYPE_CFV) {
+error_report("TDVF type %u addr 0x%" PRIx64 " in RAM (disallowed)",
+ entry->type, entry->address);
+exit(1);
+}
+
+if (entry->address < 4 * GiB) {
+en

[RFC PATCH v2 44/44] i386/tdx: disable S3/S4 unconditionally

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Disable S3/S4 unconditionally when TDX is enabled.  Because CPU state is
protected, resetting CPU state is not allowed, so S3/S4 cannot be
supported.

Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/tdx.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 0621317b0a..0dd6d94c2a 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -31,6 +31,9 @@
 #include "sysemu/tdx.h"
 #include "tdx.h"
 
+#include "hw/southbridge/piix.h"
+#include "hw/i386/ich9.h"
+
 #define TDX1_TD_ATTRIBUTE_DEBUG BIT_ULL(0)
 #define TDX1_TD_ATTRIBUTE_PERFMON BIT_ULL(63)
 #define TDX1_MIN_TSC_FREQUENCY_KHZ (100 * 1000)
@@ -103,10 +106,27 @@ static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx)
 
 static void tdx_finalize_vm(Notifier *notifier, void *unused)
 {
+Object *pm;
+bool ambig;
 MachineState *ms = MACHINE(qdev_get_machine());
 TdxGuest *tdx = TDX_GUEST(ms->cgs);
 TdxFirmwareEntry *entry;
 
+/*
+ * object look up logic is copied from acpi_get_pm_info()
+ * @ hw/i386/acpi-build.c
+ * This property override needs to be done after machine initialization
+ * as there is no ordering of creation of objects/properties.
+ */
+pm = object_resolve_path_type("", TYPE_PIIX4_PM, );
+if (ambig || !pm) {
+pm = object_resolve_path_type("", TYPE_ICH9_LPC_DEVICE, );
+}
+if (!ambig && pm) {
+object_property_set_uint(pm, ACPI_PM_PROP_S3_DISABLED, 1, NULL);
+object_property_set_uint(pm, ACPI_PM_PROP_S4_DISABLED, 1, NULL);
+}
+
 tdvf_hob_create(tdx, tdx_get_hob_entry(tdx));
 
 for_each_fw_entry(>fw, entry) {
-- 
2.25.1




[RFC PATCH v2 39/44] ioapic: add property to disallow SMI delivery mode

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a property to prevent the ioapic from setting the SMI delivery mode.
Without this guard, QEMU can exhibit unexpected behavior.

Signed-off-by: Isaku Yamahata 
---
 hw/intc/ioapic.c  | 18 ++
 hw/intc/ioapic_common.c   | 20 
 include/hw/i386/ioapic_internal.h |  1 +
 3 files changed, 39 insertions(+)

diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 6d61744961..1815fbd282 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -381,6 +381,21 @@ ioapic_fix_level_trigger_unsupported(uint64_t *entry)
 }
 }
 
+static inline void
+ioapic_fix_smi_unsupported(uint64_t *entry)
+{
+if ((*entry & IOAPIC_LVT_DELIV_MODE) ==
+IOAPIC_DM_PMI << IOAPIC_LVT_DELIV_MODE_SHIFT) {
+/*
+ * ignore a request for delivery mode of lowest SMI
+ */
+warn_report_once("attempting to set delivery mode to SMI"
+ "which is not supported");
+*entry &= ~IOAPIC_LVT_DELIV_MODE;
+*entry |= IOAPIC_DM_FIXED << IOAPIC_LVT_DELIV_MODE_SHIFT;
+}
+}
+
 static void
 ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
  unsigned int size)
@@ -424,6 +439,9 @@ ioapic_mem_write(void *opaque, hwaddr addr, uint64_t val,
 if (s->level_trigger_unsupported) {
 ioapic_fix_level_trigger_unsupported(>ioredtbl[index]);
 }
+if (s->smi_unsupported) {
+ioapic_fix_smi_unsupported(>ioredtbl[index]);
+}
 ioapic_fix_edge_remote_irr(>ioredtbl[index]);
 ioapic_service(s);
 }
diff --git a/hw/intc/ioapic_common.c b/hw/intc/ioapic_common.c
index 07ee142470..b8ef7efbad 100644
--- a/hw/intc/ioapic_common.c
+++ b/hw/intc/ioapic_common.c
@@ -168,12 +168,32 @@ static void 
ioapic_common_set_level_trigger_unsupported(Object *obj, bool value,
 s->level_trigger_unsupported = value;
 }
 
+static bool ioapic_common_get_smi_unsupported(Object *obj, Error **errp)
+{
+IOAPICCommonState *s = IOAPIC_COMMON(obj);
+return s->smi_unsupported;
+}
+
+static void ioapic_common_set_smi_unsupported(Object *obj, bool value,
+   Error **errp)
+{
+DeviceState *dev = DEVICE(obj);
+IOAPICCommonState *s = IOAPIC_COMMON(obj);
+/* only disabling before realize is allowed */
+assert(!dev->realized);
+assert(!s->smi_unsupported);
+s->smi_unsupported = value;
+}
+
 static void ioapic_common_init(Object *obj)
 {
 object_property_add_bool(obj, "level_trigger_unsupported",
  ioapic_common_get_level_trigger_unsupported,
  ioapic_common_set_level_trigger_unsupported);
 
+object_property_add_bool(obj, "smi_unsupported",
+ ioapic_common_get_smi_unsupported,
+ ioapic_common_set_smi_unsupported);
 }
 
 static void ioapic_common_realize(DeviceState *dev, Error **errp)
diff --git a/include/hw/i386/ioapic_internal.h 
b/include/hw/i386/ioapic_internal.h
index 20f2fc7897..46f22a4f85 100644
--- a/include/hw/i386/ioapic_internal.h
+++ b/include/hw/i386/ioapic_internal.h
@@ -104,6 +104,7 @@ struct IOAPICCommonState {
 uint64_t ioredtbl[IOAPIC_NUM_PINS];
 Notifier machine_done;
 bool level_trigger_unsupported;
+bool smi_unsupported;
 uint8_t version;
 uint64_t irq_count[IOAPIC_NUM_PINS];
 int irq_level[IOAPIC_NUM_PINS];
-- 
2.25.1




[RFC PATCH v2 04/44] vl: Introduce machine_init_done_late notifier

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Introduce a new notifier, machine_init_done_late, that is notified after
machine_init_done.  This will be used by TDX to generate the HOB for its
virtual firmware, which needs to be done after all guest memory has been
added, i.e. after machine_init_done notifiers have run.  Some code
registers memory from within machine_init_done() notifiers.

Signed-off-by: Isaku Yamahata 
---
 hw/core/machine.c   | 26 ++
 include/sysemu/sysemu.h |  2 ++
 2 files changed, 28 insertions(+)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index ffc076ae84..66c39cf72a 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -1278,6 +1278,31 @@ void qemu_remove_machine_init_done_notifier(Notifier 
*notify)
 notifier_remove(notify);
 }
 
+static NotifierList machine_init_done_late_notifiers =
+NOTIFIER_LIST_INITIALIZER(machine_init_done_late_notifiers);
+
+static bool machine_init_done_late;
+
+void qemu_add_machine_init_done_late_notifier(Notifier *notify)
+{
+notifier_list_add(_init_done_late_notifiers, notify);
+if (machine_init_done_late) {
+notify->notify(notify, NULL);
+}
+}
+
+void qemu_remove_machine_init_done_late_notifier(Notifier *notify)
+{
+notifier_remove(notify);
+}
+
+
+static void qemu_run_machine_init_done_late_notifiers(void)
+{
+machine_init_done_late = true;
+notifier_list_notify(_init_done_late_notifiers, NULL);
+}
+
 void qdev_machine_creation_done(void)
 {
 cpu_synchronize_all_post_init();
@@ -1311,6 +1336,7 @@ void qdev_machine_creation_done(void)
 if (rom_check_and_register_reset() != 0) {
 exit(1);
 }
+qemu_run_machine_init_done_late_notifiers();
 
 replay_start();
 
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 8fae667172..d44f8cf778 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -19,6 +19,8 @@ void qemu_remove_exit_notifier(Notifier *notify);
 void qemu_run_machine_init_done_notifiers(void);
 void qemu_add_machine_init_done_notifier(Notifier *notify);
 void qemu_remove_machine_init_done_notifier(Notifier *notify);
+void qemu_add_machine_init_done_late_notifier(Notifier *notify);
+void qemu_remove_machine_init_done_late_notifier(Notifier *notify);
 
 void configure_rtc(QemuOpts *opts);
 
-- 
2.25.1




[RFC PATCH v2 18/44] hw/i386: refactor e820_add_entry()

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

The following patch will utilize this refactoring.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/e820_memory_layout.c | 42 
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index bcf9eaf837..d9bb11c02a 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -14,31 +14,45 @@ static size_t e820_entries;
 struct e820_table e820_reserve;
 struct e820_entry *e820_table;
 
-int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
+static int e820_append_reserve(uint64_t address, uint64_t length, uint32_t 
type)
 {
 int index = le32_to_cpu(e820_reserve.count);
 struct e820_entry *entry;
 
-if (type != E820_RAM) {
-/* old FW_CFG_E820_TABLE entry -- reservations only */
-if (index >= E820_NR_ENTRIES) {
-return -EBUSY;
-}
-entry = _reserve.entry[index++];
+/* old FW_CFG_E820_TABLE entry -- reservations only */
+if (index >= E820_NR_ENTRIES) {
+return -EBUSY;
+}
+entry = _reserve.entry[index++];
 
-entry->address = cpu_to_le64(address);
-entry->length = cpu_to_le64(length);
-entry->type = cpu_to_le32(type);
+entry->address = cpu_to_le64(address);
+entry->length = cpu_to_le64(length);
+entry->type = cpu_to_le32(type);
 
-e820_reserve.count = cpu_to_le32(index);
-}
+e820_reserve.count = cpu_to_le32(index);
+return 0;
+}
 
-/* new "etc/e820" file -- include ram too */
-e820_table = g_renew(struct e820_entry, e820_table, e820_entries + 1);
+static void e820_append_entry(uint64_t address, uint64_t length, uint32_t type)
+{
 e820_table[e820_entries].address = cpu_to_le64(address);
 e820_table[e820_entries].length = cpu_to_le64(length);
 e820_table[e820_entries].type = cpu_to_le32(type);
 e820_entries++;
+}
+
+int e820_add_entry(uint64_t address, uint64_t length, uint32_t type)
+{
+if (type != E820_RAM) {
+int ret = e820_append_reserve(address, length, type);
+if (ret) {
+return ret;
+}
+}
+
+/* new "etc/e820" file -- include ram too */
+e820_table = g_renew(struct e820_entry, e820_table, e820_entries + 1);
+e820_append_entry(address, length, type);
 
 return e820_entries;
 }
-- 
2.25.1




[RFC PATCH v2 40/44] hw/i386: add a flag to disallow SMI

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a new flag to X86Machine to disallow SMI and pass it to ioapic creation
so that ioapic disallows delivery mode of SMI.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/microvm.c |  6 --
 hw/i386/pc_piix.c |  3 ++-
 hw/i386/pc_q35.c  |  3 ++-
 hw/i386/x86.c | 11 +--
 include/hw/i386/x86.h |  7 +--
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 9b03d051ca..7504324891 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -175,10 +175,12 @@ static void microvm_devices_init(MicrovmMachineState *mms)
   _abort);
 isa_bus_irqs(isa_bus, x86ms->gsi);
 
-ioapic_init_gsi(gsi_state, "machine", x86ms->eoi_intercept_unsupported);
+ioapic_init_gsi(gsi_state, "machine", x86ms->eoi_intercept_unsupported,
+x86ms->smi_unsupported);
 if (ioapics > 1) {
 x86ms->ioapic2 = ioapic_init_secondary(
-gsi_state, x86ms->eoi_intercept_unsupported);
+gsi_state, x86ms->eoi_intercept_unsupported,
+x86ms->smi_unsupported);
 }
 
 kvmclock_create(true);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index a601c4a916..0958035bf8 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -223,7 +223,8 @@ static void pc_init1(MachineState *machine,
 }
 
 if (pcmc->pci_enabled) {
-ioapic_init_gsi(gsi_state, "i440fx", x86ms->eoi_intercept_unsupported);
+ioapic_init_gsi(gsi_state, "i440fx", x86ms->eoi_intercept_unsupported,
+x86ms->smi_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 464463766c..1ab8a6a78b 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -256,7 +256,8 @@ static void pc_q35_init(MachineState *machine)
 }
 
 if (pcmc->pci_enabled) {
-ioapic_init_gsi(gsi_state, "q35", x86ms->eoi_intercept_unsupported);
+ioapic_init_gsi(gsi_state, "q35", x86ms->eoi_intercept_unsupported,
+x86ms->smi_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 88c365b72d..3dc36e3590 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -609,7 +609,8 @@ void gsi_handler(void *opaque, int n, int level)
 }
 
 void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name,
- bool level_trigger_unsupported)
+ bool level_trigger_unsupported,
+ bool smi_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
@@ -625,6 +626,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name,
   "ioapic", OBJECT(dev));
 object_property_set_bool(OBJECT(dev), "level_trigger_unsupported",
  level_trigger_unsupported, NULL);
+object_property_set_bool(OBJECT(dev), "smi_unsupported",
+ smi_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
@@ -635,7 +638,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name,
 }
 
 DeviceState *ioapic_init_secondary(GSIState *gsi_state,
-   bool level_trigger_unsupported)
+   bool level_trigger_unsupported,
+   bool smi_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
@@ -644,6 +648,8 @@ DeviceState *ioapic_init_secondary(GSIState *gsi_state,
 dev = qdev_new(TYPE_IOAPIC);
 object_property_set_bool(OBJECT(dev), "level_trigger_unsupported",
  level_trigger_unsupported, NULL);
+object_property_set_bool(OBJECT(dev), "smi_unsupported",
+ smi_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
@@ -1318,6 +1324,7 @@ static void x86_machine_initfn(Object *obj)
 x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
 x86ms->bus_lock_ratelimit = 0;
 x86ms->eoi_intercept_unsupported = false;
+x86ms->smi_unsupported = false;
 
 object_property_add_str(obj, "kvm-type",
 x86_get_kvm_type, x86_set_kvm_type);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 7536e5fb8c..3d1d74d171 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -64,6 +64,7 @@ struct X86MachineState {
 unsigned apic_id_limit;
 uint16_t boot_cpus;
 bool eoi_intercept_unsupported;
+bool smi_unsupported;
 
 OnOffAuto smm;
 OnOffAuto acpi;
@@ -141,8 +142,10 @@ typedef struct GSIState {
 qemu_irq x86_allocate_cpu_irq(void);
 void gsi_handler

[RFC PATCH v2 13/44] i386/tdx: Frame in tdx_get_supported_cpuid with KVM_TDX_CAPABILITIES

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

Add support for grabbing KVM_TDX_CAPABILITIES and use the new
kvm_get_supported_cpuid() hook to adjust the supported XCR0 bits.

Add TODOs for the remaining work.

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/kvm.c |  2 ++
 target/i386/kvm/tdx.c | 79 ---
 target/i386/kvm/tdx.h |  2 ++
 3 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 5742fa4806..25dcecd60c 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -448,6 +448,8 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t 
function,
 ret |= 1U << KVM_HINTS_REALTIME;
 }
 
+tdx_get_supported_cpuid(s, function, index, reg, );
+
 return ret;
 }
 
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index f8c7560fc8..b1e4f27c9a 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -21,6 +21,7 @@
 #include "hw/boards.h"
 #include "qapi/error.h"
 #include "qom/object_interfaces.h"
+#include "standard-headers/asm-x86/kvm_para.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_int.h"
@@ -49,7 +50,11 @@ static void __tdx_ioctl(int ioctl_no, const char *ioctl_name,
 tdx_cmd.metadata = metadata;
 tdx_cmd.data = (__u64)(unsigned long)data;
 
-r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+if (ioctl_no == KVM_TDX_CAPABILITIES) {
+r = kvm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+} else {
+r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, _cmd);
+}
 if (r) {
 error_report("%s failed: %s", ioctl_name, strerror(-r));
 exit(1);
@@ -67,6 +72,18 @@ static Notifier tdx_machine_done_late_notify = {
 .notify = tdx_finalize_vm,
 };
 
+#define TDX1_MAX_NR_CPUID_CONFIGS 6
+
+static struct {
+struct kvm_tdx_capabilities __caps;
+struct kvm_tdx_cpuid_config __cpuid_configs[TDX1_MAX_NR_CPUID_CONFIGS];
+} __tdx_caps;
+
+static struct kvm_tdx_capabilities *tdx_caps = (void *)&__tdx_caps;
+
+#define XCR0_MASK (MAKE_64BIT_MASK(0, 8) | BIT_ULL(9))
+#define XSS_MASK (~XCR0_MASK)
+
 int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
 TdxGuest *tdx = (TdxGuest *)object_dynamic_cast(OBJECT(cgs),
@@ -75,10 +92,65 @@ int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp)
 return 0;
 }
 
+QEMU_BUILD_BUG_ON(sizeof(__tdx_caps) !=
+  sizeof(struct kvm_tdx_capabilities) +
+  sizeof(struct kvm_tdx_cpuid_config) *
+  TDX1_MAX_NR_CPUID_CONFIGS);
+
+tdx_caps->nr_cpuid_configs = TDX1_MAX_NR_CPUID_CONFIGS;
+tdx_ioctl(KVM_TDX_CAPABILITIES, 0, tdx_caps);
+
 qemu_add_machine_init_done_late_notifier(_machine_done_late_notify);
+
 return 0;
 }
 
+void tdx_get_supported_cpuid(KVMState *s, uint32_t function,
+ uint32_t index, int reg, uint32_t *ret)
+{
+MachineState *ms = MACHINE(qdev_get_machine());
+TdxGuest *tdx = (TdxGuest *)object_dynamic_cast(OBJECT(ms->cgs),
+TYPE_TDX_GUEST);
+
+if (!tdx) {
+return;
+}
+
+switch (function) {
+case 1:
+if (reg == R_ECX) {
+*ret &= ~CPUID_EXT_VMX;
+}
+break;
+case 0xd:
+if (index == 0) {
+if (reg == R_EAX) {
+*ret &= (uint32_t)tdx_caps->xfam_fixed0 & XCR0_MASK;
+*ret |= (uint32_t)tdx_caps->xfam_fixed1 & XCR0_MASK;
+} else if (reg == R_EDX) {
+*ret &= (tdx_caps->xfam_fixed0 & XCR0_MASK) >> 32;
+*ret |= (tdx_caps->xfam_fixed1 & XCR0_MASK) >> 32;
+}
+} else if (index == 1) {
+/* TODO: Adjust XSS when it's supported. */
+}
+break;
+case KVM_CPUID_FEATURES:
+if (reg == R_EAX) {
+*ret &= ~((1ULL << KVM_FEATURE_CLOCKSOURCE) |
+  (1ULL << KVM_FEATURE_CLOCKSOURCE2) |
+  (1ULL << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+  (1ULL << KVM_FEATURE_ASYNC_PF) |
+  (1ULL << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+  (1ULL << KVM_FEATURE_ASYNC_PF_INT));
+}
+break;
+default:
+/* TODO: Use tdx_caps to adjust CPUID leafs. */
+break;
+}
+}
+
 void tdx_pre_create_vcpu(CPUState *cpu)
 {
 struct {
@@ -105,10 +177,7 @@ void tdx_pre_create_vcpu(CPUState *cpu)
 return;
 }
 
-/* HACK: Remove MPX support, which is not allowed by TDX. */
-env->features[FEAT_XSAVE_COMP_LO] &= ~(XSTATE_BNDREGS_MASK |
-   XSTATE_BNDCSR_MASK);
-
+/* TODO: Us

[RFC PATCH v2 19/44] hw/i386/e820: introduce a helper function to change type of e820

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Introduce a helper function, e820_change_type(), that changes
the type of a subregion of an e820 entry.
The following patch uses it.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/e820_memory_layout.c | 72 
 hw/i386/e820_memory_layout.h |  1 +
 2 files changed, 73 insertions(+)

diff --git a/hw/i386/e820_memory_layout.c b/hw/i386/e820_memory_layout.c
index d9bb11c02a..109c4f715a 100644
--- a/hw/i386/e820_memory_layout.c
+++ b/hw/i386/e820_memory_layout.c
@@ -57,6 +57,78 @@ int e820_add_entry(uint64_t address, uint64_t length, 
uint32_t type)
 return e820_entries;
 }
 
+int e820_change_type(uint64_t address, uint64_t length, uint32_t type)
+{
+size_t i;
+
+if (type != E820_RAM) {
+int ret = e820_append_reserve(address, length, type);
+if (ret) {
+return ret;
+}
+}
+
+/* new "etc/e820" file -- include ram too */
+for (i = 0; i < e820_entries; i++) {
+struct e820_entry *e = _table[i];
+struct e820_entry tmp = {
+.address = le64_to_cpu(e->address),
+.length = le64_to_cpu(e->length),
+.type = le32_to_cpu(e->type),
+};
+/* overlap? */
+if (address + length < tmp.address ||
+tmp.address + tmp.length < address) {
+continue;
+}
+/*
+ * partial-overlap is not allowed.
+ * It is assumed that the region is completely contained within
+ * other region.
+ */
+if (address < tmp.address ||
+tmp.address + tmp.length < address + length) {
+return -EINVAL;
+}
+/* only real type change is allowed. */
+if (tmp.type == type) {
+return -EINVAL;
+}
+
+if (tmp.address == address &&
+tmp.address + tmp.length == address + length) {
+e->type = cpu_to_le32(type);
+return e820_entries;
+} else if (tmp.address == address) {
+e820_table = g_renew(struct e820_entry,
+ e820_table, e820_entries + 1);
+e = _table[i];
+e->address = cpu_to_le64(tmp.address + length);
+e820_append_entry(address, length, type);
+return e820_entries;
+} else if (tmp.address + tmp.length == address + length) {
+e820_table = g_renew(struct e820_entry,
+ e820_table, e820_entries + 1);
+e = _table[i];
+e->length = cpu_to_le64(tmp.length - length);
+e820_append_entry(address, length, type);
+return e820_entries;
+} else {
+e820_table = g_renew(struct e820_entry,
+ e820_table, e820_entries + 2);
+e = _table[i];
+e->length = cpu_to_le64(address - tmp.address);
+e820_append_entry(address, length, type);
+e820_append_entry(address + length,
+  tmp.address + tmp.length - (address + length),
+  tmp.type);
+return e820_entries;
+}
+}
+
+return -EINVAL;
+}
+
 int e820_get_num_entries(void)
 {
 return e820_entries;
diff --git a/hw/i386/e820_memory_layout.h b/hw/i386/e820_memory_layout.h
index 2a0ceb8b9c..5f27cee476 100644
--- a/hw/i386/e820_memory_layout.h
+++ b/hw/i386/e820_memory_layout.h
@@ -33,6 +33,7 @@ extern struct e820_table e820_reserve;
 extern struct e820_entry *e820_table;
 
 int e820_add_entry(uint64_t address, uint64_t length, uint32_t type);
+int e820_change_type(uint64_t address, uint64_t length, uint32_t type);
 int e820_get_num_entries(void);
 bool e820_get_entry(int index, uint32_t type,
 uint64_t *address, uint64_t *length);
-- 
2.25.1




[RFC PATCH v2 37/44] hw/i386: add option to forcibly report edge trigger in acpi tables

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

When level trigger isn't supported on x86 platform, forcibly report edge
trigger in acpi tables.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/acpi-build.c  | 103 --
 hw/i386/acpi-common.c |  74 ++
 2 files changed, 124 insertions(+), 53 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 796ffc6f5c..d0d52258b9 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -866,7 +866,8 @@ static void build_dbg_aml(Aml *table)
 aml_append(table, scope);
 }
 
-static Aml *build_link_dev(const char *name, uint8_t uid, Aml *reg)
+static Aml *build_link_dev(const char *name, uint8_t uid, Aml *reg,
+   bool level_trigger_unsupported)
 {
 Aml *dev;
 Aml *crs;
@@ -878,7 +879,10 @@ static Aml *build_link_dev(const char *name, uint8_t uid, 
Aml *reg)
 aml_append(dev, aml_name_decl("_UID", aml_int(uid)));
 
 crs = aml_resource_template();
-aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+aml_append(crs, aml_interrupt(AML_CONSUMER,
+  level_trigger_unsupported ?
+  AML_EDGE : AML_LEVEL,
+  AML_ACTIVE_HIGH,
   AML_SHARED, irqs, ARRAY_SIZE(irqs)));
 aml_append(dev, aml_name_decl("_PRS", crs));
 
@@ -902,7 +906,8 @@ static Aml *build_link_dev(const char *name, uint8_t uid, 
Aml *reg)
 return dev;
  }
 
-static Aml *build_gsi_link_dev(const char *name, uint8_t uid, uint8_t gsi)
+static Aml *build_gsi_link_dev(const char *name, uint8_t uid,
+   uint8_t gsi, bool level_trigger_unsupported)
 {
 Aml *dev;
 Aml *crs;
@@ -915,7 +920,10 @@ static Aml *build_gsi_link_dev(const char *name, uint8_t 
uid, uint8_t gsi)
 
 crs = aml_resource_template();
 irqs = gsi;
-aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL, AML_ACTIVE_HIGH,
+aml_append(crs, aml_interrupt(AML_CONSUMER,
+  level_trigger_unsupported ?
+  AML_EDGE : AML_LEVEL,
+  AML_ACTIVE_HIGH,
   AML_SHARED, , 1));
 aml_append(dev, aml_name_decl("_PRS", crs));
 
@@ -934,7 +942,7 @@ static Aml *build_gsi_link_dev(const char *name, uint8_t 
uid, uint8_t gsi)
 }
 
 /* _CRS method - get current settings */
-static Aml *build_iqcr_method(bool is_piix4)
+static Aml *build_iqcr_method(bool is_piix4, bool level_trigger_unsupported)
 {
 Aml *if_ctx;
 uint32_t irqs;
@@ -942,7 +950,9 @@ static Aml *build_iqcr_method(bool is_piix4)
 Aml *crs = aml_resource_template();
 
 irqs = 0;
-aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL,
+aml_append(crs, aml_interrupt(AML_CONSUMER,
+  level_trigger_unsupported ?
+  AML_EDGE : AML_LEVEL,
   AML_ACTIVE_HIGH, AML_SHARED, , 1));
 aml_append(method, aml_name_decl("PRR0", crs));
 
@@ -976,7 +986,7 @@ static Aml *build_irq_status_method(void)
 return method;
 }
 
-static void build_piix4_pci0_int(Aml *table)
+static void build_piix4_pci0_int(Aml *table, bool level_trigger_unsupported)
 {
 Aml *dev;
 Aml *crs;
@@ -997,12 +1007,16 @@ static void build_piix4_pci0_int(Aml *table)
 aml_append(sb_scope, field);
 
 aml_append(sb_scope, build_irq_status_method());
-aml_append(sb_scope, build_iqcr_method(true));
+aml_append(sb_scope, build_iqcr_method(true, level_trigger_unsupported));
 
-aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQ0")));
-aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQ1")));
-aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQ2")));
-aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQ3")));
+aml_append(sb_scope, build_link_dev("LNKA", 0, aml_name("PRQ0"),
+level_trigger_unsupported));
+aml_append(sb_scope, build_link_dev("LNKB", 1, aml_name("PRQ1"),
+level_trigger_unsupported));
+aml_append(sb_scope, build_link_dev("LNKC", 2, aml_name("PRQ2"),
+level_trigger_unsupported));
+aml_append(sb_scope, build_link_dev("LNKD", 3, aml_name("PRQ3"),
+level_trigger_unsupported));
 
 dev = aml_device("LNKS");
 {
@@ -1011,7 +1025,9 @@ static void build_piix4_pci0_int(Aml *table)
 
 crs = aml_resource_template();
 irqs = 9;
-aml_append(crs, aml_interrupt(AML_CONSUMER, AML_LEVEL,
+aml_append(crs, aml_interrup

[RFC PATCH v2 34/44] target/i386/tdx: set reboot action to shutdown when tdx

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

In TDX, CPU state is also protected, so vcpu state can't be reset by the VMM.
Assume -action reboot=shutdown instead of silently ignoring vcpu reset.

TDX module spec version 344425-002US doesn't support vcpu reset by VMM.  VM
needs to be destroyed and created again to emulate REBOOT_ACTION_RESET.
For simplicity, put its responsibility to management system like libvirt
because it's difficult for the current qemu implementation to destroy and
re-create KVM VM resources with keeping other resources.

If management system wants reboot behavior for its users, it needs to
 - set reboot_action to REBOOT_ACTION_SHUTDOWN,
 - set shutdown_action to SHUTDOWN_ACTION_PAUSE optionally and,
 - subscribe VM state change and on reboot, (destroy qemu if
   SHUTDOWN_ACTION_PAUSE and) start new qemu.

Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/tdx.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 1316d95209..0621317b0a 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -25,6 +25,7 @@
 #include "qapi/qapi-types-misc-target.h"
 #include "standard-headers/asm-x86/kvm_para.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/runstate-action.h"
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_int.h"
 #include "sysemu/tdx.h"
@@ -363,6 +364,19 @@ static void tdx_guest_init(Object *obj)
 
 qemu_mutex_init(>lock);
 
+/*
+ * TDX module spec version 344425-002US doesn't support reset of vcpu by
+ * VMM.  VM needs to be destroyed and created again to emulate
+ * REBOOT_ACTION_RESET.  For simplicity, put its responsibility to
+ * management system like libvirt.
+ *
+ * Management system should
+ *  - set reboot_action to REBOOT_ACTION_SHUTDOWN
+ *  - set shutdown_action to SHUTDOWN_ACTION_PAUSE
+ *  - subscribe VM state and on reboot, destroy qemu and start new qemu
+ */
+reboot_action = REBOOT_ACTION_SHUTDOWN;
+
 tdx->debug = false;
 object_property_add_bool(obj, "debug", tdx_guest_get_debug,
  tdx_guest_set_debug);
-- 
2.25.1




[RFC PATCH v2 17/44] i386/tdx: Add definitions for TDVF metadata

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add constants and structs for the TD Virtual Firmware metadata, which
describes how the TDVF must be built to ensure correct functionality and
measurement.  They are defined in TDVF Design Guide [1].

[1] TDVF Design Guide
https://software.intel.com/content/dam/develop/external/us/en/documents/tdx-virtual-firmware-design-guide-rev-1.pdf

Signed-off-by: Isaku Yamahata 
Co-developed-by: Sean Christopherson 
Signed-off-by: Sean Christopherson 
---
 include/hw/i386/tdvf.h | 55 ++
 1 file changed, 55 insertions(+)
 create mode 100644 include/hw/i386/tdvf.h

diff --git a/include/hw/i386/tdvf.h b/include/hw/i386/tdvf.h
new file mode 100644
index 00..5c78e2affb
--- /dev/null
+++ b/include/hw/i386/tdvf.h
@@ -0,0 +1,55 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+
+ * Copyright (c) 2020 Intel Corporation
+ * Author: Isaku Yamahata 
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_I386_TDVF_H
+#define HW_I386_TDVF_H
+
+#include "qemu/osdep.h"
+
+#define TDVF_METDATA_OFFSET_FROM_END0x20
+
+#define TDVF_SECTION_TYPE_BFV   0
+#define TDVF_SECTION_TYPE_CFV   1
+#define TDVF_SECTION_TYPE_TD_HOB2
+#define TDVF_SECTION_TYPE_TEMP_MEM  3
+
+#define TDVF_SECTION_ATTRIBUTES_EXTENDMR(1U << 0)
+
+typedef struct {
+uint32_t DataOffset;
+uint32_t RawDataSize;
+uint64_t MemoryAddress;
+uint64_t MemoryDataSize;
+uint32_t Type;
+uint32_t Attributes;
+} TdvfSectionEntry;
+
+#define TDVF_SIGNATURE_LE32 0x46564454 /* TDVF as little endian */
+
+typedef struct {
+uint8_t Signature[4];
+uint32_t Length;
+uint32_t Version;
+uint32_t NumberOfSectionEntries;
+TdvfSectionEntry SectionEntries[];
+} TdvfMetadata;
+
+#endif /* HW_I386_TDVF_H */
-- 
2.25.1




[RFC PATCH v2 07/44] i386/kvm: Squash getting/putting guest state for TDX VMs

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

Ignore get/put state of TDX VMs as accessing/mutating guest state of
production TDs is not supported.
Allow kvm_arch_get_registers() to run as normal, except for MSRs, for
debug TDs, and silently ignore attempts to read guest state for
non-debug TDs.

Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 target/i386/kvm/kvm.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index a3d5b334d1..27b64dedc2 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -2641,6 +2641,11 @@ void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
 {
 int ret;
 
+/* TODO: Allow accessing guest state for debug TDs. */
+if (vm_type == KVM_X86_TDX_VM) {
+return;
+}
+
 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
 assert(ret == 1);
 }
@@ -4099,6 +4104,11 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 
 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+/* TODO: Allow accessing guest state for debug TDs. */
+if (vm_type == KVM_X86_TDX_VM) {
+return 0;
+}
+
 /* must be before kvm_put_nested_state so that EFER.SVME is set */
 ret = kvm_put_sregs(x86_cpu);
 if (ret < 0) {
@@ -4209,9 +4219,11 @@ int kvm_arch_get_registers(CPUState *cs)
 if (ret < 0) {
 goto out;
 }
-ret = kvm_get_msrs(cpu);
-if (ret < 0) {
-goto out;
+if (vm_type != KVM_X86_TDX_VM) {
+ret = kvm_get_msrs(cpu);
+if (ret < 0) {
+goto out;
+}
 }
 ret = kvm_get_apic(cpu);
 if (ret < 0) {
-- 
2.25.1




[RFC PATCH v2 15/44] i386/tdx: Add hook to require generic device loader

2021-07-07 Thread isaku . yamahata
From: Sean Christopherson 

Add a hook for TDX to denote that the TD Virtual Firmware must be
provided via the "generic" device loader.  Error out if pflash is used
in conjunction with TDX.

Suggested-by: Isaku Yamahata 
Signed-off-by: Sean Christopherson 
Signed-off-by: Isaku Yamahata 
---
 hw/i386/pc_sysfw.c |  6 ++
 include/sysemu/tdx.h   |  2 ++
 target/i386/kvm/tdx-stub.c |  5 +
 target/i386/kvm/tdx.c  | 25 +
 4 files changed, 38 insertions(+)

diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 6ce37a2b05..5ff571af36 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -38,6 +38,7 @@
 #include "hw/block/flash.h"
 #include "sysemu/kvm.h"
 #include "sysemu/sev.h"
+#include "sysemu/tdx.h"
 
 #define FLASH_SECTOR_SIZE 4096
 
@@ -328,6 +329,11 @@ void pc_system_firmware_init(PCMachineState *pcms,
 int i;
 BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];
 
+if (!tdx_system_firmware_init(pcms, rom_memory)) {
+pc_system_flash_cleanup_unused(pcms);
+return;
+}
+
 if (!pcmc->pci_enabled) {
 x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true);
 return;
diff --git a/include/sysemu/tdx.h b/include/sysemu/tdx.h
index 03461b6ae8..70eb01348f 100644
--- a/include/sysemu/tdx.h
+++ b/include/sysemu/tdx.h
@@ -3,8 +3,10 @@
 
 #ifndef CONFIG_USER_ONLY
 #include "sysemu/kvm.h"
+#include "hw/i386/pc.h"
 
 bool kvm_has_tdx(KVMState *s);
+int tdx_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory);
 #endif
 
 void tdx_pre_create_vcpu(CPUState *cpu);
diff --git a/target/i386/kvm/tdx-stub.c b/target/i386/kvm/tdx-stub.c
index 93afe07ddb..4e1a0a4280 100644
--- a/target/i386/kvm/tdx-stub.c
+++ b/target/i386/kvm/tdx-stub.c
@@ -7,6 +7,11 @@ bool kvm_has_tdx(KVMState *s)
 {
 return false;
 }
+
+int tdx_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory)
+{
+return -ENOSYS;
+}
 #endif
 
 void tdx_pre_create_vcpu(CPUState *cpu)
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 67fb03b4b5..48c04d344d 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -109,6 +109,31 @@ int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp)
 return 0;
 }
 
+int tdx_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory)
+{
+MachineState *ms = MACHINE(pcms);
+TdxGuest *tdx = (TdxGuest *)object_dynamic_cast(OBJECT(ms->cgs),
+TYPE_TDX_GUEST);
+int i;
+
+if (!tdx) {
+return -ENOSYS;
+}
+
+/*
+ * Sanitiy check for tdx:
+ * TDX uses generic loader to load bios instead of pflash.
+ */
+for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+if (drive_get(IF_PFLASH, 0, i)) {
+error_report("pflash not supported by VM type, "
+ "use -device loader,file=");
+exit(1);
+}
+}
+return 0;
+}
+
 void tdx_get_supported_cpuid(KVMState *s, uint32_t function,
  uint32_t index, int reg, uint32_t *ret)
 {
-- 
2.25.1




[RFC PATCH v2 38/44] hw/i386: plug eoi_intercept_unsupported to ioapic

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

When the x86 machine doesn't support EOI intercept, set the
level_trigger_unsupported property of the ioapic to true so that the
ioapic doesn't accept a configuration that uses level trigger.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/microvm.c |  5 +++--
 hw/i386/pc_piix.c |  2 +-
 hw/i386/pc_q35.c  |  2 +-
 hw/i386/x86.c | 10 --
 include/hw/i386/x86.h |  6 --
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index aba0c83219..9b03d051ca 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -175,9 +175,10 @@ static void microvm_devices_init(MicrovmMachineState *mms)
   _abort);
 isa_bus_irqs(isa_bus, x86ms->gsi);
 
-ioapic_init_gsi(gsi_state, "machine");
+ioapic_init_gsi(gsi_state, "machine", x86ms->eoi_intercept_unsupported);
 if (ioapics > 1) {
-x86ms->ioapic2 = ioapic_init_secondary(gsi_state);
+x86ms->ioapic2 = ioapic_init_secondary(
+gsi_state, x86ms->eoi_intercept_unsupported);
 }
 
 kvmclock_create(true);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4c1e31f180..a601c4a916 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -223,7 +223,7 @@ static void pc_init1(MachineState *machine,
 }
 
 if (pcmc->pci_enabled) {
-ioapic_init_gsi(gsi_state, "i440fx");
+ioapic_init_gsi(gsi_state, "i440fx", x86ms->eoi_intercept_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 106f5726cc..464463766c 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -256,7 +256,7 @@ static void pc_q35_init(MachineState *machine)
 }
 
 if (pcmc->pci_enabled) {
-ioapic_init_gsi(gsi_state, "q35");
+ioapic_init_gsi(gsi_state, "q35", x86ms->eoi_intercept_unsupported);
 }
 
 if (tcg_enabled()) {
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 9862fe5bc9..88c365b72d 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -608,7 +608,8 @@ void gsi_handler(void *opaque, int n, int level)
 }
 }
 
-void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
+void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name,
+ bool level_trigger_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
@@ -622,6 +623,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name)
 }
 object_property_add_child(object_resolve_path(parent_name, NULL),
   "ioapic", OBJECT(dev));
+object_property_set_bool(OBJECT(dev), "level_trigger_unsupported",
+ level_trigger_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
@@ -631,13 +634,16 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name)
 }
 }
 
-DeviceState *ioapic_init_secondary(GSIState *gsi_state)
+DeviceState *ioapic_init_secondary(GSIState *gsi_state,
+   bool level_trigger_unsupported)
 {
 DeviceState *dev;
 SysBusDevice *d;
 unsigned int i;
 
 dev = qdev_new(TYPE_IOAPIC);
+object_property_set_bool(OBJECT(dev), "level_trigger_unsupported",
+ level_trigger_unsupported, NULL);
 d = SYS_BUS_DEVICE(dev);
 sysbus_realize_and_unref(d, _fatal);
 sysbus_mmio_map(d, 0, IO_APIC_SECONDARY_ADDRESS);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 6eff42550f..7536e5fb8c 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -140,7 +140,9 @@ typedef struct GSIState {
 
 qemu_irq x86_allocate_cpu_irq(void);
 void gsi_handler(void *opaque, int n, int level);
-void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name);
-DeviceState *ioapic_init_secondary(GSIState *gsi_state);
+void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name,
+ bool eoi_intercept_unsupported);
+DeviceState *ioapic_init_secondary(GSIState *gsi_state,
+   bool eoi_intercept_unsupported);
 
 #endif
-- 
2.25.1




[RFC PATCH v2 32/44] tdx: add kvm_tdx_enabled() accessor for later use

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Signed-off-by: Isaku Yamahata 
---
 include/sysemu/tdx.h  | 1 +
 target/i386/kvm/kvm.c | 5 +
 2 files changed, 6 insertions(+)

diff --git a/include/sysemu/tdx.h b/include/sysemu/tdx.h
index 70eb01348f..f3eced10f9 100644
--- a/include/sysemu/tdx.h
+++ b/include/sysemu/tdx.h
@@ -6,6 +6,7 @@
 #include "hw/i386/pc.h"
 
 bool kvm_has_tdx(KVMState *s);
+bool kvm_tdx_enabled(void);
 int tdx_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory);
 #endif
 
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index af6b5f350e..76c3ea9fac 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -152,6 +152,11 @@ int kvm_set_vm_type(MachineState *ms, int kvm_type)
 return -ENOTSUP;
 }
 
+bool kvm_tdx_enabled(void)
+{
+return vm_type == KVM_X86_TDX_VM;
+}
+
 int kvm_has_pit_state2(void)
 {
 return has_pit_state2;
-- 
2.25.1




[RFC PATCH v2 36/44] hw/i386: add eoi_intercept_unsupported member to X86MachineState

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add a new bool member, eoi_intercept_unsupported, to X86MachineState with
default value false.  It is set to true for the tdx kvm type.  The
inability to intercept EOI makes it impossible to emulate re-injection of
a level-triggered interrupt while its level is still kept active, which
affects interrupt controller emulation.  Such new behavior will be
introduced later.

Signed-off-by: Isaku Yamahata 
---
 hw/i386/x86.c | 1 +
 include/hw/i386/x86.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index ed15f6f2cf..9862fe5bc9 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -1311,6 +1311,7 @@ static void x86_machine_initfn(Object *obj)
 x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6);
 x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8);
 x86ms->bus_lock_ratelimit = 0;
+x86ms->eoi_intercept_unsupported = false;
 
 object_property_add_str(obj, "kvm-type",
 x86_get_kvm_type, x86_set_kvm_type);
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index a450b5e226..6eff42550f 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -63,6 +63,7 @@ struct X86MachineState {
 unsigned pci_irq_mask;
 unsigned apic_id_limit;
 uint16_t boot_cpus;
+bool eoi_intercept_unsupported;
 
 OnOffAuto smm;
 OnOffAuto acpi;
-- 
2.25.1




[RFC PATCH v2 16/44] hw/i386: Add definitions from UEFI spec for volumes, resources, etc...

2021-07-07 Thread isaku . yamahata
From: Isaku Yamahata 

Add definitions for literals, enums, structs, GUIDs, etc... that will be
used by TDX to build the UEFI Hand-Off Block (HOB) that is passed to the
Trusted Domain Virtual Firmware (TDVF).  All values come from the UEFI
specification and TDVF design guide. [1]

Note: EFI_RESOURCE_ATTRIBUTE_{ENCRYPTED, UNACCEPTED}, will be added
in future UEFI spec.

[1] 
https://software.intel.com/content/dam/develop/external/us/en/documents/tdx-virtual-firmware-design-guide-rev-1.pdf

Signed-off-by: Isaku Yamahata 
---
 hw/i386/uefi.h | 496 +
 1 file changed, 496 insertions(+)
 create mode 100644 hw/i386/uefi.h

diff --git a/hw/i386/uefi.h b/hw/i386/uefi.h
new file mode 100644
index 00..72bfc2f6a9
--- /dev/null
+++ b/hw/i386/uefi.h
@@ -0,0 +1,496 @@
+/*
+ * Copyright (C) 2020 Intel Corporation
+ *
+ * Author: Isaku Yamahata 
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef HW_I386_UEFI_H
+#define HW_I386_UEFI_H
+
+/***/
+/*
+ * basic EFI definitions
+ * supplemented with UEFI Specification Version 2.8 (Errata A)
+ * released February 2020
+ */
+/* UEFI integer is little endian */
+
+typedef struct {
+uint32_t Data1;
+uint16_t Data2;
+uint16_t Data3;
+uint8_t Data4[8];
+} EFI_GUID;
+
+typedef uint64_t EFI_PHYSICAL_ADDRESS;
+typedef uint32_t EFI_BOOT_MODE;
+
+typedef enum {
+EfiReservedMemoryType,
+EfiLoaderCode,
+EfiLoaderData,
+EfiBootServicesCode,
+EfiBootServicesData,
+EfiRuntimeServicesCode,
+EfiRuntimeServicesData,
+EfiConventionalMemory,
+EfiUnusableMemory,
+EfiACPIReclaimMemory,
+EfiACPIMemoryNVS,
+EfiMemoryMappedIO,
+EfiMemoryMappedIOPortSpace,
+EfiPalCode,
+EfiPersistentMemory,
+EfiMaxMemoryType
+} EFI_MEMORY_TYPE;
+
+
+/*
+ * data structure firmware volume/file
+ * based on
+ * UEFI Platform Initialization Specification Version 1.7. vol 3, 3.2.1
+ */
+
+#define SIGNATURE_16(A, B)(((A) | (B << 8)))
+#define SIGNATURE_32(A, B, C, D)  (((A) | (B << 8) | (C << 16) | (D << 24)))
+#define SIGNATURE_64(A, B, C, D, E, F, G, H)\
+(SIGNATURE_32(A, B, C, D) | ((uint64_t)(SIGNATURE_32(E, F, G, H)) << 32))
+
+/***/
+/* Firmware Volume format */
+
+typedef uint32_t EFI_FV_FILE_ATTRIBUTES;
+
+
+#define EFI_FV_FILE_ATTRIB_ALIGNMENT 0x001F
+#define EFI_FV_FILE_ATTRIB_FIXED 0x0100
+#define EFI_FV_FILE_ATTRIB_MEMORY_MAPPED 0x0200
+
+typedef uint32_t EFI_FVB_ATTRIBUTES_2;
+
+
+#define EFI_FVB2_READ_DISABLED_CAP  0x0001
+#define EFI_FVB2_READ_ENABLED_CAP   0x0002
+#define EFI_FVB2_READ_STATUS0x0004
+#define EFI_FVB2_WRITE_DISABLED_CAP 0x0008
+#define EFI_FVB2_WRITE_ENABLED_CAP  0x0010
+#define EFI_FVB2_WRITE_STATUS   0x0020
+#define EFI_FVB2_LOCK_CAP   0x0040
+#define EFI_FVB2_LOCK_STATUS0x0080
+#define EFI_FVB2_STICKY_WRITE   0x0200
+#define EFI_FVB2_MEMORY_MAPPED  0x0400
+#define EFI_FVB2_ERASE_POLARITY 0x0800
+#define EFI_FVB2_READ_LOCK_CAP  0x1000
+#define EFI_FVB2_READ_LOCK_STATUS   0x2000
+#define EFI_FVB2_WRITE_LOCK_CAP 0x4000
+#define EFI_FVB2_WRITE_LOCK_STATUS  0x8000
+#define EFI_FVB2_ALIGNMENT  0x001F
+#define EFI_FVB2_WEAK_ALIGNMENT 0x8000
+#define EFI_FVB2_ALIGNMENT_10x
+#define EFI_FVB2_ALIGNMENT_20x0001
+#define EFI_FVB2_ALIGNMENT_40x0002
+#define EFI_FVB2_ALIGNMENT_80x0003
+#define EFI_FVB2_ALIGNMENT_16   0x0004
+#define EFI_FVB2_ALIGNMENT_32   0x0005
+#define EFI_FVB2_ALIGNMENT_64   0x0006
+#define EFI_FVB2_ALIGNMENT_128  0x0007
+#define EFI_FVB2_ALIGNMENT_256  0x0008
+#define EFI_FVB2_ALIGNMENT_512  0x0009
+#define EFI_FVB2_ALIGNMENT_1K   0x000A
+#define EFI_FVB2_ALIGNMENT_2K   0x000B
+#define EFI_FVB2_ALIGNMENT_4K   0x000C
+#define EFI_FVB2_ALIGNMENT_8K   0x000D
+#define EFI_FVB2_ALIGNMENT_16K  0x000E
+#define EFI_FVB2_ALIGNMENT_32K  0x000F
+#define EFI_FVB2_ALIGNMENT_64K  0x0010
+#define EFI_FVB2_AL

  1   2   3   4   5   6   7   8   9   10   >