[Xen-devel] [PATCH v3 2/3] xen/pt: Pass the whole msi addr/data to Xen

2017-11-16 Thread Chao Gao
Previously, some fields (reserved or unalterable) were filtered out by
QEMU. These fields are useless for the legacy interrupt format.
However, they may be meaningful (on Intel platforms) for interrupts
of remapping format. It is better to pass the whole MSI addr/data to
Xen without any filtering.

The main reason we want this is that QEMU doesn't have the knowledge
to decide the interrupt format once a vIOMMU is introduced inside Xen.
Pass the whole MSI message down and let the arch-specific vIOMMU
decide the interrupt format.
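
For reference, a rough sketch (based on the VT-d spec, not code from this
patch) of why filtering loses information once the remappable format is in
use: the address bits that hold the destination ID in the legacy format
carry the IRTE handle in the remappable format, so Xen needs the raw value.

    /* Illustrative only -- field layout per the VT-d spec, section 5.1.5. */
    #include <stdint.h>
    #include <stdbool.h>

    #define MSI_ADDR_IF (1u << 4)   /* interrupt format: 1 = remappable */

    static inline bool msi_is_remappable(uint32_t addr_lo)
    {
        return addr_lo & MSI_ADDR_IF;
    }

    /* Legacy format: destination ID in addr[19:12], vector in data[7:0]. */
    static inline uint8_t msi_legacy_dest(uint32_t addr_lo)
    {
        return (addr_lo >> 12) & 0xff;
    }

    /* Remappable format: IRTE handle spread over addr[19:5] and addr[2]. */
    static inline uint16_t msi_irte_handle(uint32_t addr_lo)
    {
        return ((addr_lo >> 5) & 0x7fff) | (((addr_lo >> 2) & 0x1) << 15);
    }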

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v3:
 - new
---
 hw/xen/xen_pt_msi.c | 47 ++++++++++++-----------------------------------
 1 file changed, 12 insertions(+), 35 deletions(-)

diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c
index 6d1e3bd..f7d6e76 100644
--- a/hw/xen/xen_pt_msi.c
+++ b/hw/xen/xen_pt_msi.c
@@ -47,25 +47,6 @@ static inline uint32_t msi_ext_dest_id(uint32_t addr_hi)
 return addr_hi & 0xff00;
 }
 
-static uint32_t msi_gflags(uint32_t data, uint64_t addr)
-{
-uint32_t result = 0;
-int rh, dm, dest_id, deliv_mode, trig_mode;
-
-rh = (addr >> MSI_ADDR_REDIRECTION_SHIFT) & 0x1;
-dm = (addr >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1;
-dest_id = msi_dest_id(addr);
-deliv_mode = (data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x7;
-trig_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
-
-result = dest_id | (rh << XEN_PT_GFLAGS_SHIFT_RH)
-| (dm << XEN_PT_GFLAGS_SHIFT_DM)
-| (deliv_mode << XEN_PT_GFLAGSSHIFT_DELIV_MODE)
-| (trig_mode << XEN_PT_GFLAGSSHIFT_TRG_MODE);
-
-return result;
-}
-
 static inline uint64_t msi_addr64(XenPTMSI *msi)
 {
 return (uint64_t)msi->addr_hi << 32 | msi->addr_lo;
@@ -160,23 +141,20 @@ static int msi_msix_update(XenPCIPassthroughState *s,
bool masked)
 {
 PCIDevice *d = &s->dev;
-uint8_t gvec = msi_vector(data);
-uint32_t gflags = msi_gflags(data, addr);
+uint32_t gflags = masked ? 0 : (1u << XEN_PT_GFLAGSSHIFT_UNMASKED);
 int rc = 0;
 uint64_t table_addr = 0;
 
-XEN_PT_LOG(d, "Updating MSI%s with pirq %d gvec %#x gflags %#x"
-   " (entry: %#x)\n",
-   is_msix ? "-X" : "", pirq, gvec, gflags, msix_entry);
+XEN_PT_LOG(d, "Updating MSI%s with pirq %d addr %"PRIx64
+   " data %#x gflags %#x (entry: %#x)\n",
+   is_msix ? "-X" : "", pirq, addr, data, gflags, msix_entry);
 
 if (is_msix) {
 table_addr = s->msix->mmio_base_addr;
 }
 
-gflags |= masked ? 0 : (1u << XEN_PT_GFLAGSSHIFT_UNMASKED);
-
-rc = xc_domain_update_msi_irq(xen_xc, xen_domid, gvec,
-  pirq, gflags, table_addr);
+rc = xc_domain_update_msi_irq(xen_xc, xen_domid, pirq, addr,
+  data, gflags, table_addr);
 
 if (rc) {
 XEN_PT_ERR(d, "Updating of MSI%s failed. (err: %d)\n",
@@ -199,8 +177,6 @@ static int msi_msix_disable(XenPCIPassthroughState *s,
 bool is_binded)
 {
 PCIDevice *d = &s->dev;
-uint8_t gvec = msi_vector(data);
-uint32_t gflags = msi_gflags(data, addr);
 int rc = 0;
 
 if (pirq == XEN_PT_UNASSIGNED_PIRQ) {
@@ -208,12 +184,13 @@ static int msi_msix_disable(XenPCIPassthroughState *s,
 }
 
 if (is_binded) {
-XEN_PT_LOG(d, "Unbind MSI%s with pirq %d, gvec %#x\n",
-   is_msix ? "-X" : "", pirq, gvec);
-rc = xc_domain_unbind_msi_irq(xen_xc, xen_domid, gvec, pirq, gflags);
+XEN_PT_LOG(d, "Unbind MSI%s with pirq %d, addr %"PRIx64", data %#x\n",
+   is_msix ? "-X" : "", pirq, addr, data);
+rc = xc_domain_unbind_msi_irq(xen_xc, xen_domid, pirq, addr, data);
 if (rc) {
-XEN_PT_ERR(d, "Unbinding of MSI%s failed. (err: %d, pirq: %d, gvec: %#x)\n",
-   is_msix ? "-X" : "", errno, pirq, gvec);
+XEN_PT_ERR(d, "Unbinding of MSI%s failed. (err: %d, pirq: %d, "
+   "addr: %"PRIx64", data: %#x)\n",
+   is_msix ? "-X" : "", errno, pirq, addr, data);
 return rc;
 }
 }
-- 
1.8.3.1




[Xen-devel] [PATCH v4 28/28] tools/libxc: Add viommu operations in libxc

2017-11-16 Thread Chao Gao
Add libxc helpers for XEN_DOMCTL_viommu_op. Now, it has one sub-command
- create(): create a vIOMMU in Xen, given viommu type, register-set
location and capabilities
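
A minimal usage sketch (a fragment; domid, viommu_base_addr and viommu_caps
are assumed to be set up by the caller, and VIOMMU_TYPE_INTEL_VTD comes from
the public headers added earlier in this series):

    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    uint32_t viommu_id;
    int rc;

    /* Ask Xen to instantiate a virtual VT-d unit for the domain. */
    rc = xc_viommu_create(xch, domid, VIOMMU_TYPE_INTEL_VTD,
                          viommu_base_addr, viommu_caps, &viommu_id);
    if ( rc )
        fprintf(stderr, "vIOMMU creation failed: %d\n", rc);

    xc_interface_close(xch);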

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - remove destroy() sub-command
v3:
 - Remove API for querying viommu capabilities
 - Remove pointless cast
 - Polish commit message
 - Coding style
---
 tools/libxc/Makefile  |  1 +
 tools/libxc/include/xenctrl.h |  3 +++
 tools/libxc/xc_viommu.c   | 51 +++
 3 files changed, 55 insertions(+)
 create mode 100644 tools/libxc/xc_viommu.c

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index 9a019e8..7d8c4b4 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -51,6 +51,7 @@ CTRL_SRCS-$(CONFIG_MiniOS) += xc_minios.c
 CTRL_SRCS-y   += xc_evtchn_compat.c
 CTRL_SRCS-y   += xc_gnttab_compat.c
 CTRL_SRCS-y   += xc_devicemodel_compat.c
+CTRL_SRCS-y   += xc_viommu.c
 
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c xc_suspend.c
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 8ade90c..69cf03f 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2537,6 +2537,9 @@ enum xc_static_cpu_featuremask {
 const uint32_t *xc_get_static_cpu_featuremask(enum xc_static_cpu_featuremask);
 const uint32_t *xc_get_feature_deep_deps(uint32_t feature);
 
+int xc_viommu_create(xc_interface *xch, domid_t dom, uint64_t type,
+ uint64_t base_addr, uint64_t cap, uint32_t *viommu_id);
+
 #endif
 
 int xc_livepatch_upload(xc_interface *xch,
diff --git a/tools/libxc/xc_viommu.c b/tools/libxc/xc_viommu.c
new file mode 100644
index 000..a72b2f4
--- /dev/null
+++ b/tools/libxc/xc_viommu.c
@@ -0,0 +1,51 @@
+/*
+ * xc_viommu.c
+ *
+ * viommu related API functions.
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License, version 2.1, as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "xc_private.h"
+
+int xc_viommu_create(xc_interface *xch, domid_t dom, uint64_t type,
+ uint64_t base_addr, uint64_t cap, uint32_t *viommu_id)
+{
+int rc;
+DECLARE_DOMCTL;
+
+domctl.cmd = XEN_DOMCTL_viommu_op;
+domctl.domain = dom;
+domctl.u.viommu_op.cmd = XEN_DOMCTL_viommu_create;
+domctl.u.viommu_op.u.create.type = type;
+domctl.u.viommu_op.u.create.base_address = base_addr;
+domctl.u.viommu_op.u.create.capabilities = cap;
+
+rc = do_domctl(xch, &domctl);
+if ( !rc )
+*viommu_id = domctl.u.viommu_op.u.create.id;
+
+return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
1.8.3.1




[Xen-devel] [PATCH v4 24/28] tools/libacpi: Add new fields in acpi_config for DMAR table

2017-11-16 Thread Chao Gao
The BIOS reports the remapping hardware units in a platform to system software
through the DMA Remapping Reporting (DMAR) ACPI table.
New fields are introduced for the DMAR table. These new fields are set by the
toolstack through parsing the guest's config file. construct_dmar() is added to
build the DMAR table according to the new fields.

The header files in ovmf.c are re-ordered to avoid including  in
tools/libacpi/libacpi.h.
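
For reference, with the structures from the acpi2_0.h patch in this series
(and its #pragma pack, so no padding), the table built by construct_dmar()
works out to:

    sizeof(struct acpi_dmar)               = 36 + 1 + 1 + 10 = 48 bytes
    sizeof(struct acpi_dmar_hardware_unit) = 2+2+1+1+2+8     = 16 bytes
    ioapic_scope_size                      = 6 + 2           =  8 bytes
    total DMAR size                        = 72 bytes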

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - initialize variables during declaration if possible
 - reorder the sequence of header files to avoid including 
 in tools/libacpi/libacpi.h

v3:
 - Remove chip-set specific IOAPIC BDF. Instead, let IOAPIC-related
 info be passed by struct acpi_config.
---
 tools/firmware/hvmloader/ovmf.c |  2 +-
 tools/libacpi/build.c   | 49 +
 tools/libacpi/libacpi.h |  9 
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/tools/firmware/hvmloader/ovmf.c b/tools/firmware/hvmloader/ovmf.c
index a17a11c..606ab4d 100644
--- a/tools/firmware/hvmloader/ovmf.c
+++ b/tools/firmware/hvmloader/ovmf.c
@@ -23,10 +23,10 @@
 
 #include "config.h"
 #include "smbios_types.h"
+#include "util.h"
 #include "libacpi.h"
 #include "apic_regs.h"
 #include "../rombios/config.h"
-#include "util.h"
 #include "pci_regs.h"
 #include "hypercall.h"
 
diff --git a/tools/libacpi/build.c b/tools/libacpi/build.c
index f9881c9..bd759da 100644
--- a/tools/libacpi/build.c
+++ b/tools/libacpi/build.c
@@ -303,6 +303,55 @@ static struct acpi_20_slit *construct_slit(struct acpi_ctxt *ctxt,
 return slit;
 }
 
+/*
+ * Only one DMA remapping hardware unit is exposed and all devices
+ * are under the remapping hardware unit. I/O APIC should be explicitly
+ * enumerated.
+ */
+struct acpi_dmar *construct_dmar(struct acpi_ctxt *ctxt,
+ const struct acpi_config *config)
+{
+struct acpi_dmar_hardware_unit *drhd;
+struct dmar_device_scope *scope;
+unsigned int ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]);
+unsigned int size = sizeof(struct acpi_dmar) + sizeof(*drhd) +
+ioapic_scope_size;
+struct acpi_dmar *dmar = ctxt->mem_ops.alloc(ctxt, size, 16);
+
+if ( !dmar )
+return NULL;
+
+memset(dmar, 0, size);
+dmar->header.signature = ACPI_2_0_DMAR_SIGNATURE;
+dmar->header.revision = ACPI_2_0_DMAR_REVISION;
+dmar->header.length = size;
+fixed_strcpy(dmar->header.oem_id, ACPI_OEM_ID);
+fixed_strcpy(dmar->header.oem_table_id, ACPI_OEM_TABLE_ID);
+dmar->header.oem_revision = ACPI_OEM_REVISION;
+dmar->header.creator_id   = ACPI_CREATOR_ID;
+dmar->header.creator_revision = ACPI_CREATOR_REVISION;
+dmar->host_address_width = config->host_addr_width - 1;
+if ( config->iommu_intremap_supported )
+dmar->flags |= ACPI_DMAR_INTR_REMAP;
+
+drhd = (struct acpi_dmar_hardware_unit *)((void*)dmar + sizeof(*dmar));
+drhd->type = ACPI_DMAR_TYPE_DRHD;
+drhd->length = sizeof(*drhd) + ioapic_scope_size;
+drhd->flags = ACPI_DMAR_INCLUDE_PCI_ALL;
+drhd->pci_segment = 0;
+drhd->base_address = config->iommu_base_addr;
+
+scope = &drhd->scope[0];
+scope->type = ACPI_DMAR_DEV_SCOPE_IOAPIC;
+scope->length = ioapic_scope_size;
+scope->enumeration_id = config->ioapic_id;
+scope->bus = config->ioapic_bus;
+scope->path[0] = config->ioapic_devfn;
+
+set_checksum(dmar, offsetof(struct acpi_header, checksum), size);
+return dmar;
+}
+
 static int construct_passthrough_tables(struct acpi_ctxt *ctxt,
 unsigned long *table_ptrs,
 int nr_tables,
diff --git a/tools/libacpi/libacpi.h b/tools/libacpi/libacpi.h
index a2efd23..c09afdc 100644
--- a/tools/libacpi/libacpi.h
+++ b/tools/libacpi/libacpi.h
@@ -96,8 +96,17 @@ struct acpi_config {
 uint32_t ioapic_base_address;
 uint16_t pci_isa_irq_mask;
 uint8_t ioapic_id;
+
+/* Emulated IOMMU features, location and IOAPIC under the scope of IOMMU */
+bool iommu_intremap_supported;
+uint8_t host_addr_width;
+uint8_t ioapic_bus;
+uint16_t ioapic_devfn;
+uint64_t iommu_base_addr;
 };
 
+struct acpi_dmar *construct_dmar(struct acpi_ctxt *ctxt,
+ const struct acpi_config *config);
 int acpi_build_tables(struct acpi_ctxt *ctxt, struct acpi_config *config);
 
 #endif /* __LIBACPI_H__ */
-- 
1.8.3.1




[Xen-devel] [PATCH v3 3/3] msi: Handle remappable format interrupt request

2017-11-16 Thread Chao Gao
According to the VT-d spec, Interrupt Remapping and Interrupt Posting ->
Interrupt Remapping -> Interrupt Request Formats On Intel 64
Platforms, the fields of the MSI data register have changed. This patch
avoids wrongly regarding a remappable format interrupt request as
an interrupt bound with a pirq.
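
A hand-worked example of the case the old check got wrong (the values are
hypothetical, not taken from the patch): a guest programs a remappable MSI
with IRTE handle 0 and no subhandle, i.e.

    uint32_t addr_lo = 0xfee00010;  /* bit 4 (interrupt format) set, handle 0 */
    uint32_t data    = 0x0;         /* subhandle / would-be "vector 0"        */

    /* old: xen_is_pirq_msi(data)          -> 1, wrongly treated as a pirq    */
    /* new: xen_is_pirq_msi(addr_lo, data) -> 0, handled as a remappable MSI  */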

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v3:
 - clarify the interrupt format bit is Intel-specific, then it is
 improper to define MSI_ADDR_IF_MASK in a common header.
---
 hw/i386/xen/xen-hvm.c | 10 +-
 hw/pci/msi.c  |  5 +++--
 hw/pci/msix.c |  4 +++-
 hw/xen/xen_pt_msi.c   |  2 +-
 include/hw/xen/xen.h  |  2 +-
 stubs/xen-hvm.c   |  2 +-
 6 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/hw/i386/xen/xen-hvm.c b/hw/i386/xen/xen-hvm.c
index 8028bed..52dc8af 100644
--- a/hw/i386/xen/xen-hvm.c
+++ b/hw/i386/xen/xen-hvm.c
@@ -145,8 +145,16 @@ void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
 }
 }
 
-int xen_is_pirq_msi(uint32_t msi_data)
+int xen_is_pirq_msi(uint32_t msi_addr_lo, uint32_t msi_data)
 {
+/* If the MSI address is configured in remapping format, the MSI will not
+ * be remapped into a pirq. This 'if' test excludes Intel-specific
+ * remappable msi.
+ */
+#define MSI_ADDR_IF_MASK 0x0010
+if (msi_addr_lo & MSI_ADDR_IF_MASK) {
+return 0;
+}
 /* If vector is 0, the msi is remapped into a pirq, passed as
  * dest_id.
  */
diff --git a/hw/pci/msi.c b/hw/pci/msi.c
index 5e05ce5..d05c876 100644
--- a/hw/pci/msi.c
+++ b/hw/pci/msi.c
@@ -289,7 +289,7 @@ void msi_reset(PCIDevice *dev)
 static bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
 {
 uint16_t flags = pci_get_word(dev->config + msi_flags_off(dev));
-uint32_t mask, data;
+uint32_t mask, data, addr_lo;
 bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
 assert(vector < PCI_MSI_VECTORS_MAX);
 
@@ -298,7 +298,8 @@ static bool msi_is_masked(const PCIDevice *dev, unsigned int vector)
 }
 
 data = pci_get_word(dev->config + msi_data_off(dev, msi64bit));
-if (xen_is_pirq_msi(data)) {
+addr_lo = pci_get_long(dev->config + msi_address_lo_off(dev));
+if (xen_is_pirq_msi(addr_lo, data)) {
 return false;
 }
 
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index c944c02..4cb01db 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -83,9 +83,11 @@ static bool msix_vector_masked(PCIDevice *dev, unsigned int vector, bool fmask)
 {
 unsigned offset = vector * PCI_MSIX_ENTRY_SIZE;
 uint8_t *data = &dev->msix_table[offset + PCI_MSIX_ENTRY_DATA];
+uint8_t *addr_lo = &dev->msix_table[offset + PCI_MSIX_ENTRY_LOWER_ADDR];
 /* MSIs on Xen can be remapped into pirqs. In those cases, masking
  * and unmasking go through the PV evtchn path. */
-if (xen_enabled() && xen_is_pirq_msi(pci_get_long(data))) {
+if (xen_enabled() && xen_is_pirq_msi(pci_get_long(addr_lo),
+ pci_get_long(data))) {
 return false;
 }
 return fmask || dev->msix_table[offset + PCI_MSIX_ENTRY_VECTOR_CTRL] &
diff --git a/hw/xen/xen_pt_msi.c b/hw/xen/xen_pt_msi.c
index f7d6e76..0e5bf83 100644
--- a/hw/xen/xen_pt_msi.c
+++ b/hw/xen/xen_pt_msi.c
@@ -96,7 +96,7 @@ static int msi_msix_setup(XenPCIPassthroughState *s,
 
 assert((!is_msix && msix_entry == 0) || is_msix);
 
-if (xen_is_pirq_msi(data)) {
+if (xen_is_pirq_msi(addr, data)) {
 *ppirq = msi_ext_dest_id(addr >> 32) | msi_dest_id(addr);
 if (!*ppirq) {
 /* this probably identifies an misconfiguration of the guest,
diff --git a/include/hw/xen/xen.h b/include/hw/xen/xen.h
index 7efcdaa..0d6c83e 100644
--- a/include/hw/xen/xen.h
+++ b/include/hw/xen/xen.h
@@ -34,7 +34,7 @@ int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num);
 void xen_piix3_set_irq(void *opaque, int irq_num, int level);
 void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
 void xen_hvm_inject_msi(uint64_t addr, uint32_t data);
-int xen_is_pirq_msi(uint32_t msi_data);
+int xen_is_pirq_msi(uint32_t msi_addr_lo, uint32_t msi_data);
 
 qemu_irq *xen_interrupt_controller_init(void);
 
diff --git a/stubs/xen-hvm.c b/stubs/xen-hvm.c
index 3ca6c51..aeb1592 100644
--- a/stubs/xen-hvm.c
+++ b/stubs/xen-hvm.c
@@ -31,7 +31,7 @@ void xen_hvm_inject_msi(uint64_t addr, uint32_t data)
 {
 }
 
-int xen_is_pirq_msi(uint32_t msi_data)
+int xen_is_pirq_msi(uint32_t msi_addr_lo, uint32_t msi_data)
 {
 return 0;
 }
-- 
1.8.3.1




[Xen-devel] [PATCH v4 27/28] tools/libxl: create vIOMMU during domain construction

2017-11-16 Thread Chao Gao
If guest is configured to have a vIOMMU, create it during domain construction.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - s/LOGED/LOGD
v3:
 - Remove the process of querying capabilities.
---
 tools/libxl/libxl_x86.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index cb2f494..394c70f 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -343,8 +343,25 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
 if (d_config->b_info.type != LIBXL_DOMAIN_TYPE_PV) {
 unsigned long shadow = DIV_ROUNDUP(d_config->b_info.shadow_memkb,
1024);
+unsigned int i;
+
 xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
   NULL, 0, &shadow, 0, NULL);
+
+for (i = 0; i < d_config->b_info.num_viommus; i++) {
+uint32_t id;
+libxl_viommu_info *viommu = &d_config->b_info.viommu[i];
+
+if (viommu->type == LIBXL_VIOMMU_TYPE_INTEL_VTD) {
+ret = xc_viommu_create(ctx->xch, domid, VIOMMU_TYPE_INTEL_VTD,
+   viommu->base_addr, viommu->cap, &id);
+if (ret) {
+LOGD(ERROR, domid, "create vIOMMU fail (%d)", ret);
+ret = ERROR_FAIL;
+goto out;
+}
+}
+}
 }
 
 if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
-- 
1.8.3.1




[Xen-devel] [PATCH v3 0/3] Qemu: add Xen vIOMMU interrupt remapping function support

2017-11-16 Thread Chao Gao
This patch set deals with MSI interrupt remapping requests when the guest
updates MSI registers.

In case of conflicts, this series also can be found in my personal github:
Xen: https://github.com/gc1008/viommu_xen.git vIOMMU4
Qemu: https://github.com/gc1008/viommu_qemu.git vIOMMU3

Any comments would be highly appreciated. Below is the change history.

Changes from v2:
In the last version, a new interface was used for binding a guest remappable
MSI with a physical interrupt, while the old interface was used for binding
non-remappable MSIs. But for AMD, the interrupt format cannot be inferred
from the MSI message alone. To address this, we decided to pass the whole
guest MSI message to Xen and let the vIOMMUs in Xen determine whether
a given interrupt is remappable or not.
So the following changes are made:
- Instead of introducing a new interface for binding remapping format MSIs,
the existing interface is modified to support MSIs of both formats.
- In patch 3, define MSI_ADDR_IF_MASK inside a function because
it is Intel-specific. It is improper to define it in a common header.

Chao Gao (3):
  i386/msi: Correct mask of destination ID in MSI address
  xen/pt: Pass the whole msi addr/data to Xen
  msi: Handle remappable format interrupt request

 hw/i386/xen/xen-hvm.c | 10 -
 hw/pci/msi.c  |  5 +++--
 hw/pci/msix.c |  4 +++-
 hw/xen/xen_pt_msi.c   | 49 ---
 include/hw/i386/apic-msidef.h |  2 +-
 include/hw/xen/xen.h  |  2 +-
 stubs/xen-hvm.c   |  2 +-
 7 files changed, 31 insertions(+), 43 deletions(-)

-- 
1.8.3.1




[Xen-devel] [PATCH v4 26/28] tools/libxl: build DMAR table for a guest with one virtual VTD

2017-11-16 Thread Chao Gao
New logic is added to init_acpi_config(). The logic initializes
some fields introduced for the DMAR table. For a PVH guest, the DMAR table
is built like the other tables. But for an HVM guest, only the DMAR table is
built in the toolstack and passed through to the guest via the existing
firmware pass-through mechanism.
Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - build DMAR table for PVH guest.
 - remove LIBXL_DEVICE_MODEL_VERSION_NONE

v3:
 - build dmar and initialize related acpi_modules struct in
 libxl_x86_acpi.c, keeping in accordance with pvh.
---
 tools/libacpi/build.c| 12 ++
 tools/libacpi/libacpi.h  |  1 +
 tools/libxl/libxl_x86.c  |  4 +-
 tools/libxl/libxl_x86_acpi.c | 98 ++--
 4 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/tools/libacpi/build.c b/tools/libacpi/build.c
index bd759da..df0a67c 100644
--- a/tools/libacpi/build.c
+++ b/tools/libacpi/build.c
@@ -517,6 +517,18 @@ static int construct_secondary_tables(struct acpi_ctxt *ctxt,
 printf("Failed to build SLIT, skipping...\n");
 }
 
+/* DMAR */
+if ( config->table_flags & ACPI_HAS_DMAR )
+{
+struct acpi_dmar *dmar = construct_dmar(ctxt, config);
+
+if ( dmar )
+table_ptrs[nr_tables++] = ctxt->mem_ops.v2p(ctxt, dmar);
+else
+printf("Failed to build DMAR, skipping...\n");
+}
+
+
 /* Load any additional tables passed through. */
 nr_tables += construct_passthrough_tables(ctxt, table_ptrs,
   nr_tables, config);
diff --git a/tools/libacpi/libacpi.h b/tools/libacpi/libacpi.h
index c09afdc..bdeeccc 100644
--- a/tools/libacpi/libacpi.h
+++ b/tools/libacpi/libacpi.h
@@ -36,6 +36,7 @@
 #define ACPI_HAS_8042  (1<<13)
 #define ACPI_HAS_CMOS_RTC  (1<<14)
 #define ACPI_HAS_SSDT_LAPTOP_SLATE (1<<15)
+#define ACPI_HAS_DMAR  (1<<16)
 
 struct xen_vmemrange;
 struct acpi_numa {
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 5f91fe4..cb2f494 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -383,7 +383,9 @@ int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
 {
 int rc = 0;
 
-if (info->type == LIBXL_DOMAIN_TYPE_PVH) {
+
+if (info->type == LIBXL_DOMAIN_TYPE_HVM
+|| info->type == LIBXL_DOMAIN_TYPE_PVH) {
 rc = libxl__dom_load_acpi(gc, info, dom);
 if (rc != 0)
 LOGE(ERROR, "libxl_dom_load_acpi failed");
diff --git a/tools/libxl/libxl_x86_acpi.c b/tools/libxl/libxl_x86_acpi.c
index 9a7c904..bbe9219 100644
--- a/tools/libxl/libxl_x86_acpi.c
+++ b/tools/libxl/libxl_x86_acpi.c
@@ -16,6 +16,7 @@
 #include "libxl_arch.h"
 #include 
 #include 
+#include "libacpi/acpi2_0.h"
 #include "libacpi/libacpi.h"
 
 #include 
@@ -100,6 +101,25 @@ static int init_acpi_config(libxl__gc *gc,
 struct hvm_info_table *hvminfo;
 int i, r, rc;
 
+if ((b_info->num_viommus == 1) &&
+(b_info->viommu[0].type == LIBXL_VIOMMU_TYPE_INTEL_VTD)) {
+if (libxl_defbool_val(b_info->viommu[0].intremap))
+config->iommu_intremap_supported = true;
+config->iommu_base_addr = b_info->viommu[0].base_addr;
+
+/* IOAPIC id and PSEUDO BDF */
+config->ioapic_id = 1;
+config->ioapic_bus = 0xff;
+config->ioapic_devfn = 0x0;
+
+config->host_addr_width = 39;
+config->table_flags |= ACPI_HAS_DMAR;
+}
+
+if (b_info->type == LIBXL_DOMAIN_TYPE_HVM) {
+return 0;
+}
+
 config->dsdt_anycpu = config->dsdt_15cpu = dsdt_pvh;
 config->dsdt_anycpu_len = config->dsdt_15cpu_len = dsdt_pvh_len;
 
@@ -161,9 +181,9 @@ out:
 return rc;
 }
 
-int libxl__dom_load_acpi(libxl__gc *gc,
- const libxl_domain_build_info *b_info,
- struct xc_dom_image *dom)
+static int libxl__dom_load_acpi_pvh(libxl__gc *gc,
+const libxl_domain_build_info *b_info,
+struct xc_dom_image *dom)
 {
 struct acpi_config config = {0};
 struct libxl_acpi_ctxt libxl_ctxt;
@@ -235,6 +255,78 @@ out:
 return rc;
 }
 
+static void *acpi_memalign(struct acpi_ctxt *ctxt, uint32_t size,
+   uint32_t align)
+{
+int ret;
+void *ptr;
+
+ret = posix_memalign(, align, size);
+if (ret != 0 || !ptr)
+return NULL;
+
+return ptr;
+}
+
+/*
+ * For hvm, we don't need to build acpi in libxl. Instead, it's built in hvmloader.
+ * But if one hvm has virtual VTD(s), we build the DMAR table for it and join this
+ * table with existing content in acpi_modules in order to employ HVM
+ * firmware pass-through mechanism to pass-through DMAR table.
+ */
+static int l

[Xen-devel] [PATCH v4 18/28] x86/vioapic: Hook interrupt delivery of vIOAPIC

2017-11-16 Thread Chao Gao
When interrupt remapping is enabled, an IOAPIC Redirection Entry may be in
remapping format. If so, generate an irq_remapping_request and call the common
vIOMMU abstraction's callback to handle this interrupt request. The device
model is responsible for checking the request's validity.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v3:
 - use the new interface to check remapping format.
---
 xen/arch/x86/hvm/vioapic.c   | 9 +
 xen/include/asm-x86/viommu.h | 9 +
 2 files changed, 18 insertions(+)

diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index 97b419f..0f20e3f 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -387,9 +388,17 @@ static void vioapic_deliver(struct hvm_vioapic *vioapic, unsigned int pin)
 struct vlapic *target;
 struct vcpu *v;
 unsigned int irq = vioapic->base_gsi + pin;
+struct arch_irq_remapping_request request;
 
 ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock));
 
+irq_request_ioapic_fill(&request, vioapic->id, vioapic->redirtbl[pin].bits);
+if ( viommu_check_irq_remapping(d, &request) )
+{
+viommu_handle_irq_request(d, &request);
+return;
+}
+
 HVM_DBG_LOG(DBG_LEVEL_IOAPIC,
 "dest=%x dest_mode=%x delivery_mode=%x "
 "vector=%x trig_mode=%x",
diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h
index 3d995ba..e526e38 100644
--- a/xen/include/asm-x86/viommu.h
+++ b/xen/include/asm-x86/viommu.h
@@ -49,6 +49,15 @@ struct arch_irq_remapping_request
 enum viommu_irq_request_type type;
 };
 
+static inline void irq_request_ioapic_fill(
+struct arch_irq_remapping_request *req, uint32_t ioapic_id, uint64_t rte)
+{
+ASSERT(req);
+req->type = VIOMMU_REQUEST_IRQ_APIC;
+req->source_id = ioapic_id;
+req->msg.rte = rte;
+}
+
 #endif /* __ARCH_X86_VIOMMU_H__ */
 
 /*
-- 
1.8.3.1




[Xen-devel] [PATCH v4 14/28] x86/vvtd: Handle interrupt translation faults

2017-11-16 Thread Chao Gao
Interrupt translation faults are non-recoverable faults. When a fault
is triggered, the fault information needs to be populated into the Fault
Recording Registers and an MSI injected to notify the guest IOMMU driver
to deal with the fault.

This patch emulates the hardware's handling of interrupt translation
faults (more information about the process can be found in the VT-d spec,
chapter "Translation Faults", section "Non-Recoverable Fault
Reporting" and section "Non-Recoverable Logging").
Specifically, viommu_record_fault() records the fault information and
viommu_report_non_recoverable_fault() reports faults to software.
Currently, only Primary Fault Logging is supported and the Number of
Fault-recording Registers is 1.
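
As a reminder of how the guest driver locates that single fault recording
register (a sketch based on the VT-d spec, not code from this patch; cap is
assumed to hold the value of the virtual capability register):

    /* CAP_REG fields: NFR = cap[47:40], FRO = cap[33:24]. */
    unsigned int nfr  = ((cap >> 40) & 0xff) + 1;  /* # of FRCD regs, 1 here       */
    unsigned int fro  = (cap >> 24) & 0x3ff;       /* offset in 16-byte units      */
    unsigned int frcd = fro * 16;                  /* byte offset of the first
                                                      FRCD reg, DMA_FRCD_LEN each  */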

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - introduce a lock to protect fault-event related regs
---
 xen/drivers/passthrough/vtd/iommu.h |  51 ++-
 xen/drivers/passthrough/vtd/vvtd.c  | 288 +++-
 2 files changed, 333 insertions(+), 6 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index 82edd2a..dc2df75 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -196,26 +196,67 @@
 #define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59))
 
 /* FECTL_REG */
-#define DMA_FECTL_IM((uint32_t)1 << 31)
+#define DMA_FECTL_IM_SHIFT  31
+#define DMA_FECTL_IP_SHIFT  30
+#define DMA_FECTL_IM((uint32_t)1 << DMA_FECTL_IM_SHIFT)
+#define DMA_FECTL_IP((uint32_t)1 << DMA_FECTL_IP_SHIFT)
 
 /* FSTS_REG */
-#define DMA_FSTS_PFO((uint32_t)1 << 0)
-#define DMA_FSTS_PPF((uint32_t)1 << 1)
+#define DMA_FSTS_PFO_SHIFT  0
+#define DMA_FSTS_PPF_SHIFT  1
+#define DMA_FSTS_PRO_SHIFT  7
+
+#define DMA_FSTS_PFO((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
+#define DMA_FSTS_PPF((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
 #define DMA_FSTS_AFO((uint32_t)1 << 2)
 #define DMA_FSTS_APF((uint32_t)1 << 3)
 #define DMA_FSTS_IQE((uint32_t)1 << 4)
 #define DMA_FSTS_ICE((uint32_t)1 << 5)
 #define DMA_FSTS_ITE((uint32_t)1 << 6)
-#define DMA_FSTS_FAULTS DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE
+#define DMA_FSTS_PRO((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
+#define DMA_FSTS_FAULTS (DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | \
+ DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | \
+ DMA_FSTS_ITE | DMA_FSTS_PRO)
+#define DMA_FSTS_RW1CS  (DMA_FSTS_PFO | DMA_FSTS_AFO | DMA_FSTS_APF | \
+ DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE | \
+ DMA_FSTS_PRO)
 #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
 
 /* FRCD_REG, 32 bits access */
-#define DMA_FRCD_F (((u64)1) << 31)
+#define DMA_FRCD_LEN0x10
+#define DMA_FRCD2_OFFSET0x8
+#define DMA_FRCD3_OFFSET0xc
+#define DMA_FRCD_F_SHIFT31
+#define DMA_FRCD_F ((u64)1 << DMA_FRCD_F_SHIFT)
 #define dma_frcd_type(d) ((d >> 30) & 1)
 #define dma_frcd_fault_reason(c) (c & 0xff)
 #define dma_frcd_source_id(c) (c & 0x)
 #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
 
+struct vtd_fault_record_register
+{
+union {
+struct {
+uint64_t lo;
+uint64_t hi;
+} bits;
+struct {
+uint64_t rsvd0  :12,
+ fault_info :52;
+uint64_t source_id  :16,
+ rsvd1  :9,
+ pmr:1,  /* Privilege Mode Requested */
+ exe:1,  /* Execute Permission Requested */
+ pasid_p:1,  /* PASID Present */
+ fault_reason   :8,  /* Fault Reason */
+ pasid_val  :20, /* PASID Value */
+ addr_type  :2,  /* Address Type */
+ type   :1,  /* Type. (0) Write (1) Read/AtomicOp */
+ fault  :1;  /* Fault */
+} fields;
+};
+};
+
 /* Interrupt remapping transition faults */
 #define VTD_FR_IR_REQ_RSVD  0x20
 #define VTD_FR_IR_INDEX_OVER0x21
diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index d3dec01..83805d1 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -43,6 +43,7 @@
 struct hvm_hw_vvtd {
 bool eim_enabled;
 bool intremap_enabled;
+uint32_t fault_index;
 
 /* Interrupt remapping table base gfn and the max of entries */
 uint16_t irt_max_entry;
@@ -58,6 +59,12 @@ struct vvtd {
 struct domain *domain;
 /* # of in

[Xen-devel] [PATCH v4 13/28] x86/vvtd: add a helper function to decide the interrupt format

2017-11-16 Thread Chao Gao
Different platforms may use different methods to distinguish
remapping format interrupts from normal format interrupts.

Intel uses one bit in the IOAPIC RTE or the MSI address register to
indicate that the interrupt is in remapping format. vvtd should handle
all the interrupts for which .check_irq_remapping() returns true.
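
Concretely, the indicator bits vvtd checks are (a sketch per the VT-d spec,
not the patch's own code):

    /* Remappable-format indicators on Intel platforms. */
    static inline bool msi_addr_is_remap_format(uint64_t addr)
    {
        return addr & (1ULL << 4);    /* Interrupt Format bit of the MSI address */
    }

    static inline bool ioapic_rte_is_remap_format(uint64_t rte)
    {
        return rte & (1ULL << 48);    /* Format bit of the remappable RTE layout */
    }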

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v3:
 - new
---
 xen/drivers/passthrough/vtd/vvtd.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index 9890cc2..d3dec01 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -565,6 +565,15 @@ static int vvtd_get_irq_info(const struct domain *d,
 return 0;
 }
 
+/* check whether the interrupt request is remappable */
+static bool vvtd_is_remapping(const struct domain *d,
+  const struct arch_irq_remapping_request *irq)
+{
+uint32_t idx;
+
+return !irq_remapping_request_index(irq, );
+}
+
 static void vvtd_reset(struct vvtd *vvtd)
 {
 uint64_t cap = cap_set_num_fault_regs(VVTD_FRCD_NUM)
@@ -628,6 +637,7 @@ static const struct viommu_ops vvtd_hvm_vmx_ops = {
 .destroy = vvtd_destroy,
 .handle_irq_request = vvtd_handle_irq_request,
 .get_irq_info = vvtd_get_irq_info,
+.check_irq_remapping = vvtd_is_remapping,
 };
 
 REGISTER_VIOMMU(vvtd_hvm_vmx_ops);
-- 
1.8.3.1




[Xen-devel] [PATCH v4 08/28] x86/vvtd: Add MMIO handler for VVTD

2017-11-16 Thread Chao Gao
This patch adds VVTD MMIO handler to deal with MMIO access.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - only trap the register emulated in vvtd_in_range().
   i.e. replace PAGE_SIZE with the VVTD_MAX_OFFSET
---
 xen/drivers/passthrough/vtd/vvtd.c | 55 ++
 1 file changed, 55 insertions(+)

diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index 9f76ccf..d78d878 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -94,6 +94,60 @@ static inline uint64_t vvtd_get_reg_quad(const struct vvtd *vvtd, uint32_t reg)
 return *(uint64_t*)VVTD_REG_POS(vvtd, reg);
 }
 
+static void *domain_vvtd(const struct domain *d)
+{
+if ( is_hvm_domain(d) && d->arch.hvm_domain.viommu )
+return d->arch.hvm_domain.viommu->priv;
+else
+return NULL;
+}
+
+static int vvtd_in_range(struct vcpu *v, unsigned long addr)
+{
+struct vvtd *vvtd = domain_vvtd(v->domain);
+
+if ( vvtd )
+return (addr >= vvtd->base_addr) &&
+   (addr < vvtd->base_addr + VVTD_MAX_OFFSET);
+return 0;
+}
+
+static int vvtd_read(struct vcpu *v, unsigned long addr,
+ unsigned int len, unsigned long *pval)
+{
+struct vvtd *vvtd = domain_vvtd(v->domain);
+unsigned int offset = addr - vvtd->base_addr;
+
+vvtd_info("Read offset %x len %d\n", offset, len);
+
+if ( (len != 4 && len != 8) || (offset & (len - 1)) )
+return X86EMUL_OKAY;
+
+if ( len == 4 )
+*pval = vvtd_get_reg(vvtd, offset);
+else
+*pval = vvtd_get_reg_quad(vvtd, offset);
+
+return X86EMUL_OKAY;
+}
+
+static int vvtd_write(struct vcpu *v, unsigned long addr,
+  unsigned int len, unsigned long val)
+{
+struct vvtd *vvtd = domain_vvtd(v->domain);
+unsigned int offset = addr - vvtd->base_addr;
+
+vvtd_info("Write offset %x len %d val %lx\n", offset, len, val);
+
+return X86EMUL_OKAY;
+}
+
+static const struct hvm_mmio_ops vvtd_mmio_ops = {
+.check = vvtd_in_range,
+.read = vvtd_read,
+.write = vvtd_write
+};
+
 static void vvtd_reset(struct vvtd *vvtd)
 {
 uint64_t cap = cap_set_num_fault_regs(VVTD_FRCD_NUM)
@@ -126,6 +180,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
 vvtd_reset(vvtd);
 vvtd->base_addr = viommu->base_address;
 vvtd->domain = d;
+register_mmio_handler(d, &vvtd_mmio_ops);
 
 viommu->priv = vvtd;
 
-- 
1.8.3.1




[Xen-devel] [PATCH v4 20/28] xen/pt: when binding guest msi, accept the whole msi message

2017-11-16 Thread Chao Gao
... rather than a filtered one. Previously, some fields (reserved or
unalterable) were filtered out by QEMU. These fields are useless for the
legacy interrupt format (i.e. non-remappable format). However, these
fields are meaningful to the remappable format. Accepting the whole MSI
message will significantly reduce the effort needed to support binding
remappable format MSIs.
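
With the reworked interface, a caller such as QEMU simply forwards what the
guest programmed (a usage fragment mirroring the QEMU-side change earlier in
this series; gflags now only carries the unmasked bit):

    /* addr/data are the raw values the guest wrote to the MSI capability. */
    rc = xc_domain_update_msi_irq(xch, domid, pirq, addr, data,
                                  masked ? 0 : (1u << XEN_PT_GFLAGSSHIFT_UNMASKED),
                                  table_addr);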

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - new
---
 tools/libxc/include/xenctrl.h |  7 ---
 tools/libxc/xc_domain.c   | 14 --
 xen/arch/x86/hvm/vmsi.c   | 12 ++--
 xen/drivers/passthrough/io.c  | 36 +---
 xen/include/asm-x86/hvm/irq.h |  5 +++--
 xen/include/public/domctl.h   |  8 ++--
 6 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 666db0b..8ade90c 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1756,16 +1756,17 @@ int xc_domain_ioport_mapping(xc_interface *xch,
 int xc_domain_update_msi_irq(
 xc_interface *xch,
 uint32_t domid,
-uint32_t gvec,
 uint32_t pirq,
+uint64_t addr,
+uint32_t data,
 uint32_t gflags,
 uint64_t gtable);
 
 int xc_domain_unbind_msi_irq(xc_interface *xch,
  uint32_t domid,
- uint32_t gvec,
  uint32_t pirq,
- uint32_t gflags);
+ uint64_t addr,
+ uint32_t data);
 
 int xc_domain_bind_pt_irq(xc_interface *xch,
   uint32_t domid,
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index 3ccd27f..f7baf11 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -1735,8 +1735,9 @@ int xc_deassign_dt_device(
 int xc_domain_update_msi_irq(
 xc_interface *xch,
 uint32_t domid,
-uint32_t gvec,
 uint32_t pirq,
+uint64_t addr,
+uint32_t data,
 uint32_t gflags,
 uint64_t gtable)
 {
@@ -1750,7 +1751,8 @@ int xc_domain_update_msi_irq(
 bind = &(domctl.u.bind_pt_irq);
 bind->irq_type = PT_IRQ_TYPE_MSI;
 bind->machine_irq = pirq;
-bind->u.msi.gvec = gvec;
+bind->u.msi.addr = addr;
+bind->u.msi.data = data;
 bind->u.msi.gflags = gflags;
 bind->u.msi.gtable = gtable;
 
@@ -1761,9 +1763,9 @@ int xc_domain_update_msi_irq(
 int xc_domain_unbind_msi_irq(
 xc_interface *xch,
 uint32_t domid,
-uint32_t gvec,
 uint32_t pirq,
-uint32_t gflags)
+uint64_t addr,
+uint32_t data)
 {
 int rc;
 struct xen_domctl_bind_pt_irq *bind;
@@ -1775,8 +1777,8 @@ int xc_domain_unbind_msi_irq(
 bind = &(domctl.u.bind_pt_irq);
 bind->irq_type = PT_IRQ_TYPE_MSI;
 bind->machine_irq = pirq;
-bind->u.msi.gvec = gvec;
-bind->u.msi.gflags = gflags;
+bind->u.msi.addr = addr;
+bind->u.msi.data = data;
 
 rc = do_domctl(xch, &domctl);
 return rc;
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 7126de7..5edb0e7 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -101,12 +101,12 @@ int vmsi_deliver(
 
 void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci)
 {
-uint32_t flags = pirq_dpci->gmsi.gflags;
-int vector = pirq_dpci->gmsi.gvec;
-uint8_t dest = (uint8_t)flags;
-bool dest_mode = flags & XEN_DOMCTL_VMSI_X86_DM_MASK;
-uint8_t delivery_mode = MASK_EXTR(flags, XEN_DOMCTL_VMSI_X86_DELIV_MASK);
-bool trig_mode = flags & XEN_DOMCTL_VMSI_X86_TRIG_MASK;
+uint8_t vector = pirq_dpci->gmsi.data & MSI_DATA_VECTOR_MASK;
+uint8_t dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
+bool dest_mode = pirq_dpci->gmsi.addr & MSI_ADDR_DESTMODE_MASK;
+uint8_t delivery_mode = MASK_EXTR(pirq_dpci->gmsi.data,
+  MSI_DATA_DELIVERY_MODE_MASK);
+bool trig_mode = pirq_dpci->gmsi.data & MSI_DATA_TRIGGER_MASK;
 
 HVM_DBG_LOG(DBG_LEVEL_IOAPIC,
 "msi: dest=%x dest_mode=%x delivery_mode=%x "
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 8f16e6c..d8c66bf 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -339,19 +339,17 @@ int pt_irq_create_bind(
 {
 case PT_IRQ_TYPE_MSI:
 {
-uint8_t dest, delivery_mode;
+uint8_t dest, delivery_mode, gvec;
 bool dest_mode;
 int dest_vcpu_id;
 const struct vcpu *vcpu;
-uint32_t gflags = pt_irq_bind->u.msi.gflags &
-  ~XEN_DOMCTL_VMSI_X86_UNMASKED;
 
 if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
 {
 pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI |
 

[Xen-devel] [PATCH v4 22/28] x86/vmsi: Hook delivering remapping format msi to guest and handling eoi

2017-11-16 Thread Chao Gao
When delivering a guest MSI, first the format of the MSI is determined
by the 'check_irq_remapping' method of viommu. Then, an MSI of
non-remapping format is delivered as normal and a remapping format MSI is
handled by viommu. When handling EOI, the interrupt attributes (vector,
affinity) are used to search for the physical irq. It is clear that for
remapping format MSIs, the interrupt attributes should be decoded from
the IRTE.
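
Conceptually, the vector used on the EOI side is now picked like this (a
fragment sketching the idea, not the exact hunk below; request/info are local
variables of the series' arch_irq_remapping_* types):

    irq_request_msi_fill(&request, pirq_dpci->gmsi.addr, pirq_dpci->gmsi.data);
    if ( viommu_check_irq_remapping(d, &request) &&
         !viommu_get_irq_info(d, &request, &info) )
        vector = info.vector;                                  /* from the IRTE */
    else
        vector = pirq_dpci->gmsi.data & MSI_DATA_VECTOR_MASK;  /* data[7:0]     */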

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
 xen/arch/x86/hvm/irq.c   |  6 ++
 xen/arch/x86/hvm/vmsi.c  | 33 +
 xen/drivers/passthrough/io.c | 35 +++
 3 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index e425df9..b561480 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -339,6 +340,11 @@ int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data)
 uint8_t trig_mode = (data & MSI_DATA_TRIGGER_MASK)
 >> MSI_DATA_TRIGGER_SHIFT;
 uint8_t vector = data & MSI_DATA_VECTOR_MASK;
+struct arch_irq_remapping_request request;
+
+irq_request_msi_fill(&request, addr, data);
+if ( viommu_check_irq_remapping(d, &request) )
+return viommu_handle_irq_request(d, &request);
 
 if ( !vector )
 {
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 5edb0e7..9dc5631 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -101,21 +102,29 @@ int vmsi_deliver(
 
 void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci)
 {
-uint8_t vector = pirq_dpci->gmsi.data & MSI_DATA_VECTOR_MASK;
-uint8_t dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
-bool dest_mode = pirq_dpci->gmsi.addr & MSI_ADDR_DESTMODE_MASK;
-uint8_t delivery_mode = MASK_EXTR(pirq_dpci->gmsi.data,
-  MSI_DATA_DELIVERY_MODE_MASK);
-bool trig_mode = pirq_dpci->gmsi.data & MSI_DATA_TRIGGER_MASK;
-
-HVM_DBG_LOG(DBG_LEVEL_IOAPIC,
-"msi: dest=%x dest_mode=%x delivery_mode=%x "
-"vector=%x trig_mode=%x\n",
-dest, dest_mode, delivery_mode, vector, trig_mode);
+struct arch_irq_remapping_request request;
 
 ASSERT(pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI);
 
-vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode);
+irq_request_msi_fill(&request, pirq_dpci->gmsi.addr, pirq_dpci->gmsi.data);
+if ( viommu_check_irq_remapping(d, &request) )
+viommu_handle_irq_request(d, &request);
+else
+{
+uint8_t vector = pirq_dpci->gmsi.data & MSI_DATA_VECTOR_MASK;
+uint8_t dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
+bool dest_mode = pirq_dpci->gmsi.addr & MSI_ADDR_DESTMODE_MASK;
+uint8_t delivery_mode = MASK_EXTR(pirq_dpci->gmsi.data,
+  MSI_DATA_DELIVERY_MODE_MASK);
+bool trig_mode = pirq_dpci->gmsi.data & MSI_DATA_TRIGGER_MASK;
+
+HVM_DBG_LOG(DBG_LEVEL_IOAPIC,
+"msi: dest=%x dest_mode=%x delivery_mode=%x "
+"vector=%x trig_mode=%x\n",
+dest, dest_mode, delivery_mode, vector, trig_mode);
+
+vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode);
+}
 }
 
 /* Return value, -1 : multi-dests, non-negative value: dest_vcpu_id */
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 9198ef5..34a3cf1 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -872,16 +872,35 @@ static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci)
 static int _hvm_dpci_msi_eoi(struct domain *d,
  struct hvm_pirq_dpci *pirq_dpci, void *arg)
 {
-int vector = (long)arg;
-
-if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) &&
- (pirq_dpci->gmsi.gvec == vector) )
+if ( pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI )
 {
-uint32_t dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
-bool dest_mode = pirq_dpci->gmsi.addr & MSI_ADDR_DESTMODE_MASK;
+uint8_t vector, vector_target = (long)arg;
+uint32_t dest;
+bool dm;
+struct arch_irq_remapping_request request;
+
+irq_request_msi_fill(&request, pirq_dpci->gmsi.addr,
+ pirq_dpci->gmsi.data);
+if ( viommu_check_irq_remapping(d, &request) )
+{
+struct arch_irq_remapping_info info;
+
+if ( viommu_get_irq_info(d, &request, &info) )
+return 0;
+
+vector = info.

[Xen-devel] [PATCH v4 23/28] tools/libacpi: Add DMA remapping reporting (DMAR) ACPI table structures

2017-11-16 Thread Chao Gao
Add DMAR table structures according to Chapter 8 "BIOS Considerations" of
the VT-d spec, Rev. 2.4.

VT-d spec: http://www.intel.com/content/dam/www/public/us/en/documents/product-specifications/vt-directed-io-spec.pdf

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
 tools/libacpi/acpi2_0.h | 61 +
 1 file changed, 61 insertions(+)

diff --git a/tools/libacpi/acpi2_0.h b/tools/libacpi/acpi2_0.h
index 2619ba3..6081417 100644
--- a/tools/libacpi/acpi2_0.h
+++ b/tools/libacpi/acpi2_0.h
@@ -422,6 +422,65 @@ struct acpi_20_slit {
 };
 
 /*
+ * DMA Remapping Table header definition (DMAR)
+ */
+
+/*
+ * DMAR Flags.
+ */
+#define ACPI_DMAR_INTR_REMAP(1 << 0)
+#define ACPI_DMAR_X2APIC_OPT_OUT(1 << 1)
+
+struct acpi_dmar {
+struct acpi_header header;
+uint8_t host_address_width;
+uint8_t flags;
+uint8_t reserved[10];
+};
+
+/*
+ * Device Scope Types
+ */
+#define ACPI_DMAR_DEV_SCOPE_PCI_ENDPOINT0x01
+#define ACPI_DMAR_DEV_SCOPE_PCI_SUB_HRCHY   0x02
+#define ACPI_DMAR_DEV_SCOPE_IOAPIC  0x03
+#define ACPI_DMAR_DEV_SCOPE_HPET0x04
+#define ACPI_DMAR_DEV_SCOPE_ACPI_NS_DEV 0x05
+
+struct dmar_device_scope {
+uint8_t type;
+uint8_t length;
+uint8_t reserved[2];
+uint8_t enumeration_id;
+uint8_t bus;
+uint16_t path[0];
+};
+
+/*
+ * DMA Remapping Hardware Unit Types
+ */
+#define ACPI_DMAR_TYPE_DRHD 0x00
+#define ACPI_DMAR_TYPE_RMRR 0x01
+#define ACPI_DMAR_TYPE_ATSR 0x02
+#define ACPI_DMAR_TYPE_RHSA 0x03
+#define ACPI_DMAR_TYPE_ANDD 0x04
+
+/*
+ * DMA Remapping Hardware Unit Flags. All other bits are reserved and must be 0.
+ */
+#define ACPI_DMAR_INCLUDE_PCI_ALL   (1 << 0)
+
+struct acpi_dmar_hardware_unit {
+uint16_t type;
+uint16_t length;
+uint8_t flags;
+uint8_t reserved;
+uint16_t pci_segment;
+uint64_t base_address;
+struct dmar_device_scope scope[0];
+};
+
+/*
  * Table Signatures.
  */
 #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ')
@@ -435,6 +494,7 @@ struct acpi_20_slit {
 #define ACPI_2_0_WAET_SIGNATURE ASCII32('W','A','E','T')
 #define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T')
 #define ACPI_2_0_SLIT_SIGNATURE ASCII32('S','L','I','T')
+#define ACPI_2_0_DMAR_SIGNATURE ASCII32('D','M','A','R')
 
 /*
  * Table revision numbers.
@@ -449,6 +509,7 @@ struct acpi_20_slit {
 #define ACPI_1_0_FADT_REVISION 0x01
 #define ACPI_2_0_SRAT_REVISION 0x01
 #define ACPI_2_0_SLIT_REVISION 0x01
+#define ACPI_2_0_DMAR_REVISION 0x01
 
 #pragma pack ()
 
-- 
1.8.3.1




[Xen-devel] [PATCH v4 15/28] x86/vvtd: Enable Queued Invalidation through GCMD

2017-11-16 Thread Chao Gao
Software writes the QIE field of GCMD to enable or disable queued
invalidation. This patch emulates the QIE field of GCMD.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
 xen/drivers/passthrough/vtd/iommu.h |  3 ++-
 xen/drivers/passthrough/vtd/vvtd.c  | 18 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index dc2df75..b71dab8 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -160,7 +160,8 @@
 #define DMA_GSTS_FLS(((u64)1) << 29)
 #define DMA_GSTS_AFLS   (((u64)1) << 28)
 #define DMA_GSTS_WBFS   (((u64)1) << 27)
-#define DMA_GSTS_QIES   (((u64)1) <<26)
+#define DMA_GSTS_QIES_SHIFT 26
+#define DMA_GSTS_QIES   (((u64)1) << DMA_GSTS_QIES_SHIFT)
 #define DMA_GSTS_IRES_SHIFT 25
 #define DMA_GSTS_IRES   (((u64)1) << DMA_GSTS_IRES_SHIFT)
 #define DMA_GSTS_SIRTPS_SHIFT   24
diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index 83805d1..a2fa64a 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -539,6 +539,20 @@ static void write_gcmd_ire(struct vvtd *vvtd, uint32_t val)
 (vvtd, DMAR_GSTS_REG, DMA_GSTS_IRES_SHIFT);
 }
 
+static void write_gcmd_qie(struct vvtd *vvtd, uint32_t val)
+{
+bool set = val & DMA_GCMD_QIE;
+
+vvtd_info("%sable Queue Invalidation\n", set ? "En" : "Dis");
+
+if ( set )
+vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, 0);
+
+(set ? vvtd_set_bit : vvtd_clear_bit)
+(vvtd, DMAR_GSTS_REG, DMA_GSTS_QIES_SHIFT);
+
+}
+
 static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t val)
 {
 uint64_t irta = vvtd_get_reg_quad(vvtd, DMAR_IRTA_REG);
@@ -598,6 +612,10 @@ static void vvtd_write_gcmd(struct vvtd *vvtd, uint32_t val)
 write_gcmd_sirtp(vvtd, val);
 if ( changed & DMA_GCMD_IRE )
 write_gcmd_ire(vvtd, val);
+if ( changed & DMA_GCMD_QIE )
+write_gcmd_qie(vvtd, val);
+if ( changed & ~(DMA_GCMD_SIRTP | DMA_GCMD_IRE | DMA_GCMD_QIE) )
+vvtd_info("Only SIRTP, IRE, QIE in GCMD are handled");
 }
 
 static int vvtd_in_range(struct vcpu *v, unsigned long addr)
-- 
1.8.3.1




[Xen-devel] [PATCH v4 12/28] x86/vvtd: decode interrupt attribute from IRTE

2017-11-16 Thread Chao Gao
Without interrupt remapping, interrupt attributes can be extracted from
the MSI message or the IOAPIC RTE. However, with interrupt remapping enabled,
the attributes are enclosed in the associated IRTE. This callback is
for cases in which the caller wants to acquire the interrupt attributes, for
example:
1. vioapic_get_vector(). With a vIOMMU, the RTE may not contain the vector.
2. performing EOI, which is always based on the interrupt vector.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v3:
 - add example cases in which we will use this function.
---
 xen/drivers/passthrough/vtd/vvtd.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
index 927e715..9890cc2 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -541,6 +541,30 @@ static int vvtd_handle_irq_request(const struct domain *d,
 return ret;
 }
 
+static int vvtd_get_irq_info(const struct domain *d,
+ const struct arch_irq_remapping_request *irq,
+ struct arch_irq_remapping_info *info)
+{
+int ret;
+struct iremap_entry irte;
+struct vvtd *vvtd = domain_vvtd(d);
+
+if ( !vvtd )
+return -ENODEV;
+
+ret = vvtd_get_entry(vvtd, irq, &irte);
+/* not in an interrupt delivery, don't report faults to guest */
+if ( ret )
+return ret;
+
+info->vector = irte.remap.vector;
+info->dest = irte_dest(vvtd, irte.remap.dst);
+info->dest_mode = irte.remap.dm;
+info->delivery_mode = irte.remap.dlm;
+
+return 0;
+}
+
 static void vvtd_reset(struct vvtd *vvtd)
 {
 uint64_t cap = cap_set_num_fault_regs(VVTD_FRCD_NUM)
@@ -603,6 +627,7 @@ static const struct viommu_ops vvtd_hvm_vmx_ops = {
 .create = vvtd_create,
 .destroy = vvtd_destroy,
 .handle_irq_request = vvtd_handle_irq_request,
+.get_irq_info = vvtd_get_irq_info,
 };
 
 REGISTER_VIOMMU(vvtd_hvm_vmx_ops);
-- 
1.8.3.1




[Xen-devel] [PATCH v4 25/28] tools/libxl: Add an user configurable parameter to control vIOMMU attributes

2017-11-16 Thread Chao Gao
A field, viommu_info, is added to struct libxl_domain_build_info. Several
attributes can be specified in the guest config file for the virtual IOMMU.
These attributes are used for DMAR construction and vIOMMU creation.
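
For a caller going through libxl directly, populating the new field might
look roughly like this (an illustrative fragment; libxl_viommu_info_init()
is the IDL-generated initializer assumed to exist for the new type, and
0xfed90000 stands in for the VTD_BASE_ADDRESS default mentioned in the
changelog below):

    libxl_domain_build_info *b_info = &d_config.b_info;

    b_info->viommu = calloc(1, sizeof(*b_info->viommu));
    libxl_viommu_info_init(&b_info->viommu[0]);
    b_info->viommu[0].type = LIBXL_VIOMMU_TYPE_INTEL_VTD;
    libxl_defbool_set(&b_info->viommu[0].intremap, true);
    b_info->viommu[0].base_addr = 0xfed90000;   /* register-set location        */
    b_info->viommu[0].cap = viommu_cap;         /* caller-chosen capabilities   */
    b_info->num_viommus = 1;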

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - Move VTD_BASE_ADDRESS's definition to libxl_arch.h

v3:
 - allow an array of viommu rather than only one viommu to present to guest.
 During domain building, an error would be raised for
 multiple viommus case since we haven't implemented this yet.
 - provide a libxl__viommu_set_default() for viommu
---
 docs/man/xl.cfg.pod.5.in| 27 
 tools/libxl/libxl_arch.h|  1 +
 tools/libxl/libxl_create.c  | 47 ++
 tools/libxl/libxl_types.idl | 12 +++
 tools/xl/xl_parse.c | 50 -
 5 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/docs/man/xl.cfg.pod.5.in b/docs/man/xl.cfg.pod.5.in
index b7b91d8..2a48cb8 100644
--- a/docs/man/xl.cfg.pod.5.in
+++ b/docs/man/xl.cfg.pod.5.in
@@ -1803,6 +1803,33 @@ L<http://www.microsoft.com/en-us/download/details.aspx?id=30707>
 
 =back 
 
+=item B

[Xen-devel] [PATCH v4 21/28] vvtd: update hvm_gmsi_info when binding guest msi with pirq or

2017-11-16 Thread Chao Gao
... handling guest's invalidation requests.

To support the pirq migration optimization and the use of VT-d posted
interrupts to inject MSIs from assigned devices, each time the guest programs
MSI information (affinity, vector), the struct hvm_gmsi_info should be updated
accordingly. But after introducing vvtd, the guest only needs to update an
IRTE, which is in guest memory, to program MSI information. vvtd doesn't trap
reads/writes to that memory range. Instead, it traps the queued invalidation,
which is the method used to notify VT-d hardware that an IRTE has changed.

This patch updates the hvm_gmsi_info structure and programs physical IRTEs to
use VT-d posted interrupts if possible when binding a guest MSI with a pirq or
when handling the guest's invalidation request. For the latter, all physical
interrupts bound to the domain are walked to find the ones matching the IRTE.

Note: calling vvtd_process_iq() in vvtd_read() rather than in
vvtd_handle_irq_request() is to avoid an ABBA deadlock between d->event_lock
and vvtd->ie_lock.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - new
---
 xen/arch/x86/hvm/hvm.c |  2 +-
 xen/drivers/passthrough/io.c   | 89 --
 xen/drivers/passthrough/vtd/vvtd.c | 70 --
 xen/include/asm-x86/hvm/hvm.h  |  2 +
 xen/include/asm-x86/hvm/irq.h  |  1 +
 xen/include/asm-x86/viommu.h   | 11 +
 6 files changed, 147 insertions(+), 28 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 964418a..d2c1372 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -462,7 +462,7 @@ void hvm_migrate_timers(struct vcpu *v)
 pt_migrate(v);
 }
 
-static int hvm_migrate_pirq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
+int hvm_migrate_pirq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci,
 void *arg)
 {
 struct vcpu *v = arg;
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index d8c66bf..9198ef5 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -275,6 +276,61 @@ static struct vcpu *vector_hashing_dest(const struct 
domain *d,
 return dest;
 }
 
+void pt_update_gmsi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
+{
+uint8_t dest, delivery_mode;
+bool dest_mode;
+int dest_vcpu_id;
+const struct vcpu *vcpu;
+struct arch_irq_remapping_request request;
+struct arch_irq_remapping_info remap_info;
+
+ASSERT(spin_is_locked(&d->event_lock));
+
+/* Calculate dest_vcpu_id for MSI-type pirq migration. */
+irq_request_msi_fill(&request, pirq_dpci->gmsi.addr, pirq_dpci->gmsi.data);
+if ( viommu_check_irq_remapping(d, &request) )
+{
+/* An error in IRTE, don't perform the optimization */
+if ( viommu_get_irq_info(d, &request, &remap_info) )
+{
+pirq_dpci->gmsi.posted = false;
+pirq_dpci->gmsi.dest_vcpu_id = -1;
+pirq_dpci->gmsi.gvec = 0;
+return;
+}
+
+dest = remap_info.dest;
+dest_mode = remap_info.dest_mode;
+delivery_mode = remap_info.delivery_mode;
+pirq_dpci->gmsi.gvec = remap_info.vector;
+}
+else
+{
+dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
+dest_mode = pirq_dpci->gmsi.addr & MSI_ADDR_DESTMODE_MASK;
+delivery_mode = MASK_EXTR(pirq_dpci->gmsi.data,
+  MSI_DATA_DELIVERY_MODE_MASK);
+pirq_dpci->gmsi.gvec = pirq_dpci->gmsi.data & MSI_DATA_VECTOR_MASK;
+}
+
+dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
+pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
+
+pirq_dpci->gmsi.posted = false;
+vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
+if ( iommu_intpost )
+{
+if ( delivery_mode == dest_LowestPrio )
+vcpu = vector_hashing_dest(d, dest, dest_mode, pirq_dpci->gmsi.gvec);
+if ( vcpu )
+{
+pirq_dpci->gmsi.posted = true;
+pirq_dpci->gmsi.dest_vcpu_id = vcpu->vcpu_id;
+}
+}
+}
+
 int pt_irq_create_bind(
 struct domain *d, const struct xen_domctl_bind_pt_irq *pt_irq_bind)
 {
@@ -339,9 +395,6 @@ int pt_irq_create_bind(
 {
 case PT_IRQ_TYPE_MSI:
 {
-uint8_t dest, delivery_mode, gvec;
-bool dest_mode;
-int dest_vcpu_id;
 const struct vcpu *vcpu;
 
 if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
@@ -411,35 +464,23 @@ int pt_irq_create_bind(
 pirq_dpci->gmsi.addr = pt_irq_bind->u.msi.addr;
 }
 }
-/* Calculate dest_vcpu_id for MSI-type pirq migration. */
-dest = MASK_EXTR(pirq_dpci->gmsi.addr, MSI_ADDR_DEST_ID_MASK);
-d

[Xen-devel] [PATCH v4 19/28] x86/vioapic: extend vioapic_get_vector() to support remapping format RTE

2017-11-16 Thread Chao Gao
When an IOAPIC RTE is in remapping format, it doesn't contain the vector of
the interrupt. In this case, the RTE contains an index into the interrupt
remapping table where the vector of the interrupt is stored. This patch gets
the vector through a vIOMMU interface.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
 xen/arch/x86/hvm/vioapic.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/hvm/vioapic.c b/xen/arch/x86/hvm/vioapic.c
index 0f20e3f..8b34b21 100644
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -560,11 +560,23 @@ int vioapic_get_vector(const struct domain *d, unsigned int gsi)
 {
 unsigned int pin;
 const struct hvm_vioapic *vioapic = gsi_vioapic(d, gsi, );
+struct arch_irq_remapping_request request;
 
 if ( !vioapic )
 return -EINVAL;
 
-return vioapic->redirtbl[pin].fields.vector;
+irq_request_ioapic_fill(&request, vioapic->id, vioapic->redirtbl[pin].bits);
+if ( viommu_check_irq_remapping(vioapic->domain, &request) )
+{
+struct arch_irq_remapping_info info;
+
+return unlikely(viommu_get_irq_info(vioapic->domain, &request, &info))
+   ? : info.vector;
+}
+else
+{
+return vioapic->redirtbl[pin].fields.vector;
+}
 }
 
 int vioapic_get_trigger_mode(const struct domain *d, unsigned int gsi)
-- 
1.8.3.1




[Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support

2017-11-16 Thread Chao Gao
Queued Invalidation Interface is an expanded invalidation interface with
extended capabilities. Hardware implementations report support for queued
invalidation interface through the Extended Capability Register. The queued
invalidation interface uses an Invalidation Queue (IQ), which is a circular
buffer in system memory. Software submits commands by writing Invalidation
Descriptors to the IQ.

In this patch, a new function viommu_process_iq() is used for emulating how
hardware handles invalidation requests through QI.
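
Roughly, the emulation walks the IQ from the head register to the tail
register, one 16-byte descriptor at a time. A minimal sketch of such a loop
(reusing names from this series; vvtd_set_reg_quad() and the wrap-around
computation are illustrative assumptions, not the literal patch code):

    uint64_t iqa  = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
    uint64_t head = vvtd_get_reg_quad(vvtd, DMAR_IQH_REG) >> QINVAL_INDEX_SHIFT;
    uint64_t tail = vvtd_get_reg_quad(vvtd, DMAR_IQT_REG) >> QINVAL_INDEX_SHIFT;
    /* 2^QS 4K pages, each holding 256 16-byte descriptors */
    uint64_t nr   = 1ULL << (DMA_IQA_QS(iqa) + 8);

    while ( head != tail && !process_iqe(vvtd, head) )
        head = (head + 1) % nr;                  /* the queue is circular */

    vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, head << QINVAL_INDEX_SHIFT);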

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - Introduce a lock to protect invalidation related registers.
---
 xen/drivers/passthrough/vtd/iommu.h |  24 +++-
 xen/drivers/passthrough/vtd/vvtd.c  | 271 +++-
 2 files changed, 293 insertions(+), 2 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index b71dab8..de9188b 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -47,7 +47,12 @@
 #define DMAR_IQH_REG0x80 /* invalidation queue head */
 #define DMAR_IQT_REG0x88 /* invalidation queue tail */
 #define DMAR_IQA_REG0x90 /* invalidation queue addr */
+#define DMAR_IQUA_REG   0x94 /* invalidation queue upper addr */
+#define DMAR_ICS_REG0x9c /* invalidation completion status */
 #define DMAR_IECTL_REG  0xa0 /* invalidation event control register */
+#define DMAR_IEDATA_REG 0xa4 /* invalidation event data register */
+#define DMAR_IEADDR_REG 0xa8 /* invalidation event address register */
+#define DMAR_IEUADDR_REG0xac /* upper address register */
 #define DMAR_IRTA_REG   0xb8 /* base address of intr remap table */
 #define DMAR_IRTUA_REG  0xbc /* upper address of intr remap table */
 
@@ -175,6 +180,21 @@
 #define DMA_IRTA_S(val) (val & 0xf)
 #define DMA_IRTA_SIZE(val)  (1UL << (DMA_IRTA_S(val) + 1))
 
+/* IQA_REG */
+#define DMA_IQA_ADDR(val)   (val & ~0xfffULL)
+#define DMA_IQA_QS(val) (val & 0x7)
+#define DMA_IQA_RSVD0xff8ULL
+
+/* IECTL_REG */
+#define DMA_IECTL_IM_SHIFT 31
+#define DMA_IECTL_IM(1U << DMA_IECTL_IM_SHIFT)
+#define DMA_IECTL_IP_SHIFT 30
+#define DMA_IECTL_IP(1U << DMA_IECTL_IP_SHIFT)
+
+/* ICS_REG */
+#define DMA_ICS_IWC_SHIFT   0
+#define DMA_ICS_IWC (1U << DMA_ICS_IWC_SHIFT)
+
 /* PMEN_REG */
 #define DMA_PMEN_EPM(((u32)1) << 31)
 #define DMA_PMEN_PRS(((u32)1) << 0)
@@ -205,13 +225,14 @@
 /* FSTS_REG */
 #define DMA_FSTS_PFO_SHIFT  0
 #define DMA_FSTS_PPF_SHIFT  1
+#define DMA_FSTS_IQE_SHIFT  4
 #define DMA_FSTS_PRO_SHIFT  7
 
 #define DMA_FSTS_PFO((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
 #define DMA_FSTS_PPF((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
 #define DMA_FSTS_AFO((uint32_t)1 << 2)
 #define DMA_FSTS_APF((uint32_t)1 << 3)
-#define DMA_FSTS_IQE((uint32_t)1 << 4)
+#define DMA_FSTS_IQE((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
 #define DMA_FSTS_ICE((uint32_t)1 << 5)
 #define DMA_FSTS_ITE((uint32_t)1 << 6)
 #define DMA_FSTS_PRO((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
@@ -555,6 +576,7 @@ struct qinval_entry {
 
 /* Queue invalidation head/tail shift */
 #define QINVAL_INDEX_SHIFT 4
+#define QINVAL_INDEX_MASK  0x7fff0ULL
 
 #define qinval_present(v) ((v).lo & 1)
 #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
index a2fa64a..81170ec 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "iommu.h"
@@ -68,6 +69,9 @@ struct vvtd {
 
 struct hvm_hw_vvtd hw;
 void *irt_base;
+void *inv_queue_base;
+/* This lock protects invalidation related registers */
+spinlock_t ie_lock;
 };
 
 /* Setting viommu_verbose enables debugging messages of vIOMMU */
@@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
 vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
 }
 
+static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
+{
+vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
+vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
+}
+
 /* Computing the IRTE index for a given interrupt request. When success, return
  * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
  * i.e. -1 when the irq request isn't an remapping format.
@@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
 return X86EMUL_OKAY;
 }
 
+/*
+ * Process an invalidation descriptor. Currently, only two types descriptors,

[Xen-devel] [PATCH v4 17/28] x86/vvtd: save and restore emulated VT-d

2017-11-16 Thread Chao Gao
Provide a save-restore pair to save/restore registers and non-register
status.

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v3:
 - use one entry to save both vvtd registers and other intermediate
 state
---
 xen/drivers/passthrough/vtd/vvtd.c | 57 +++---
 xen/include/public/arch-x86/hvm/save.h | 18 ++-
 2 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
index 81170ec..f6bde69 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -27,8 +27,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 
 #include "iommu.h"
 #include "vtd.h"
@@ -38,20 +40,6 @@
 
 #define VVTD_FRCD_NUM   1ULL
 #define VVTD_FRCD_START (DMAR_IRTA_REG + 8)
-#define VVTD_FRCD_END   (VVTD_FRCD_START + VVTD_FRCD_NUM * 16)
-#define VVTD_MAX_OFFSET VVTD_FRCD_END
-
-struct hvm_hw_vvtd {
-bool eim_enabled;
-bool intremap_enabled;
-uint32_t fault_index;
-
-/* Interrupt remapping table base gfn and the max of entries */
-uint16_t irt_max_entry;
-gfn_t irt;
-
-uint32_t regs[VVTD_MAX_OFFSET/sizeof(uint32_t)];
-};
 
 struct vvtd {
 /* Base address of remapping hardware register-set */
@@ -776,7 +764,7 @@ static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t 
val)
 if ( vvtd->hw.intremap_enabled )
 vvtd_info("Update Interrupt Remapping Table when active\n");
 
-if ( gfn_x(vvtd->hw.irt) != PFN_DOWN(DMA_IRTA_ADDR(irta)) ||
+if ( vvtd->hw.irt != PFN_DOWN(DMA_IRTA_ADDR(irta)) ||
  vvtd->hw.irt_max_entry != DMA_IRTA_SIZE(irta) )
 {
 if ( vvtd->irt_base )
@@ -786,14 +774,14 @@ static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t 
val)
  sizeof(struct iremap_entry)));
 vvtd->irt_base = NULL;
 }
-vvtd->hw.irt = _gfn(PFN_DOWN(DMA_IRTA_ADDR(irta)));
+vvtd->hw.irt = PFN_DOWN(DMA_IRTA_ADDR(irta));
 vvtd->hw.irt_max_entry = DMA_IRTA_SIZE(irta);
 vvtd->hw.eim_enabled = !!(irta & IRTA_EIME);
 vvtd_info("Update IR info (addr=%lx eim=%d size=%d)\n",
-  gfn_x(vvtd->hw.irt), vvtd->hw.eim_enabled,
+  vvtd->hw.irt, vvtd->hw.eim_enabled,
   vvtd->hw.irt_max_entry);
 
-vvtd->irt_base = map_guest_pages(vvtd->domain, gfn_x(vvtd->hw.irt),
+vvtd->irt_base = map_guest_pages(vvtd->domain, vvtd->hw.irt,
  PFN_UP(vvtd->hw.irt_max_entry *
 sizeof(struct iremap_entry)));
 }
@@ -1138,6 +1126,39 @@ static bool vvtd_is_remapping(const struct domain *d,
 return !irq_remapping_request_index(irq, );
 }
 
+static int vvtd_load(struct domain *d, hvm_domain_context_t *h)
+{
+struct vvtd *vvtd = domain_vvtd(d);
+uint64_t iqa;
+
+if ( !vvtd )
+return -ENODEV;
+
+if ( hvm_load_entry(VVTD, h, &vvtd->hw) )
+return -EINVAL;
+
+iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
+vvtd->irt_base = map_guest_pages(vvtd->domain, vvtd->hw.irt,
+ PFN_UP(vvtd->hw.irt_max_entry *
+sizeof(struct iremap_entry)));
+vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
+   PFN_DOWN(DMA_IQA_ADDR(iqa)),
+   1 << DMA_IQA_QS(iqa));
+return 0;
+}
+
+static int vvtd_save(struct domain *d, hvm_domain_context_t *h)
+{
+struct vvtd *vvtd = domain_vvtd(d);
+
+if ( !vvtd )
+return 0;
+
+return hvm_save_entry(VVTD, 0, h, &vvtd->hw);
+}
+
+HVM_REGISTER_SAVE_RESTORE(VVTD, vvtd_save, vvtd_load, 1, HVMSR_PER_DOM);
+
 static void vvtd_reset(struct vvtd *vvtd)
 {
 uint64_t cap = cap_set_num_fault_regs(VVTD_FRCD_NUM)
diff --git a/xen/include/public/arch-x86/hvm/save.h 
b/xen/include/public/arch-x86/hvm/save.h
index fd7bf3f..24a513b 100644
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -639,10 +639,26 @@ struct hvm_msr {
 
 #define CPU_MSR_CODE  20
 
+#define VVTD_MAX_OFFSET 0xd0
+struct hvm_hw_vvtd
+{
+uint32_t eim_enabled : 1,
+ intremap_enabled : 1;
+uint32_t fault_index;
+
+/* Interrupt remapping table base gfn and the max of entries */
+uint32_t irt_max_entry;
+uint64_t irt;
+
+uint32_t regs[VVTD_MAX_OFFSET/sizeof(uint32_t)];
+};
+
+DECLARE_HVM_SAVE_TYPE(VVTD, 21, struct hvm_hw_vvtd);
+
 /* 
  * Largest type-code in use
  */
-#define HVM_SAVE_CODE_MAX 20
+#define HVM_SAVE_CODE_MAX 21
 
 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
 
-- 
1.8.3.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 11/28] x86/vvtd: Process interrupt remapping request

2017-11-16 Thread Chao Gao
When a remapping interrupt request arrives, remapping hardware computes the
interrupt_index per the algorithm described in VTD spec
"Interrupt Remapping Table", interprets the IRTE and generates a remapped
interrupt request.

This patch introduces viommu_handle_irq_request() to emulate how remapping
hardware handles a remapping interrupt request. It also introduces a counter,
inflight_intr, which counts the number of interrupts currently being handled.
The reason for this counter is that VT-d hardware should drain in-flight
interrupts before setting flags that show certain operations are completed.
These operations include enabling interrupt remapping and handling certain
kinds of invalidation requests. In vvtd, we likewise drain in-flight
interrupts by waiting until inflight_intr drops to 0.
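
The intended usage pattern is roughly the following (the field name comes
from this patch; the surrounding calls are only an illustration):

    /* Delivery path: account the interrupt while it is being handled. */
    atomic_inc(&vvtd->inflight_intr);
    ret = vvtd_delivery(d, vector, dest, dest_mode, delivery_mode, trig_mode);
    atomic_dec(&vvtd->inflight_intr);

    /* Drain point: only process queued invalidations when nothing is in flight. */
    if ( !atomic_read(&vvtd->inflight_intr) )
        viommu_process_iq(vvtd);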

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - use "#define" to define interrupt remapping transition faults
 rather than using an enum
 - use switch-case rather than if-else in irq_remapping_request_index()
 and vvtd_irq_request_sanity_check()
 - introduce a counter inflight_intr

v3:
 - Encode map_guest_page()'s error into void* to avoid using another parameter
---
 xen/drivers/passthrough/vtd/iommu.h |  15 +++
 xen/drivers/passthrough/vtd/vvtd.c  | 219 
 2 files changed, 234 insertions(+)

diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index 9c59aeb..82edd2a 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -216,6 +216,15 @@
 #define dma_frcd_source_id(c) (c & 0x)
 #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
 
+/* Interrupt remapping transition faults */
+#define VTD_FR_IR_REQ_RSVD  0x20
+#define VTD_FR_IR_INDEX_OVER0x21
+#define VTD_FR_IR_ENTRY_P   0x22
+#define VTD_FR_IR_ROOT_INVAL0x23
+#define VTD_FR_IR_IRTE_RSVD 0x24
+#define VTD_FR_IR_REQ_COMPAT0x25
+#define VTD_FR_IR_SID_ERR   0x26
+
 /*
  * 0: Present
  * 1-11: Reserved
@@ -356,6 +365,12 @@ struct iremap_entry {
 };
 
 /*
+ * When VT-d doesn't enable extended interrupt mode, hardware interprets
+ * 8-bits ([15:8]) of Destination-ID field in the IRTEs.
+ */
+#define IRTE_xAPIC_DEST_MASK 0xff00
+
+/*
 * Posted-interrupt descriptor address is 64 bits and 64-byte aligned; only
 * the upper 26 bits of the least significant 32 bits are available.
  */
diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
index 06e522a..927e715 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -22,11 +22,15 @@
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
+#include 
 #include 
 #include 
 
 #include "iommu.h"
+#include "vtd.h"
 
 /* Supported capabilities by vvtd */
 #define VVTD_MAX_CAPS VIOMMU_CAP_IRQ_REMAPPING
@@ -52,6 +56,8 @@ struct vvtd {
 uint64_t base_addr;
 /* Point back to the owner domain */
 struct domain *domain;
+/* # of in-flight interrupts */
+atomic_t inflight_intr;
 
 struct hvm_hw_vvtd hw;
 void *irt_base;
@@ -181,6 +187,109 @@ static void unmap_guest_pages(void *va, uint32_t nr)
 put_page_and_type(mfn_to_page(mfn[i]));
 }
 
+static int vvtd_delivery(struct domain *d, uint8_t vector,
+ uint32_t dest, bool dest_mode,
+ uint8_t delivery_mode, uint8_t trig_mode)
+{
+struct vlapic *target;
+struct vcpu *v;
+
+switch ( delivery_mode )
+{
+case dest_LowestPrio:
+target = vlapic_lowest_prio(d, NULL, 0, dest, dest_mode);
+if ( target != NULL )
+{
+vvtd_debug("d%d: dest=v%d dlm=%x vector=%d trig_mode=%d\n",
+   vlapic_domain(target)->domain_id,
+   vlapic_vcpu(target)->vcpu_id,
+   delivery_mode, vector, trig_mode);
+vlapic_set_irq(target, vector, trig_mode);
+break;
+}
+vvtd_debug("d%d: null round robin: vector=%02x\n",
+   d->domain_id, vector);
+break;
+
+case dest_Fixed:
+for_each_vcpu ( d, v )
+if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) )
+{
+vvtd_debug("d%d: dest=v%d dlm=%x vector=%d trig_mode=%d\n",
+   v->domain->domain_id, v->vcpu_id,
+   delivery_mode, vector, trig_mode);
+vlapic_set_irq(vcpu_vlapic(v), vector, trig_mode);
+}
+break;
+
+case dest_NMI:
+for_each_vcpu ( d, v )
+if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) &&
+ !test_and_set_bool(v->nmi_pending) )
+vcpu_kick(v);
+b

[Xen-devel] [PATCH v4 06/28] vtd: clean-up and preparation for vvtd

2017-11-16 Thread Chao Gao
This patch contains the following changes:
- align register definitions
- use MASK_EXTR to define some macros for extended capabilities
rather than open-coding the masks
- define fields of FECTL and FESTS as uint32_t rather than u64, since
FECTL and FESTS are 32-bit registers.

No functional changes.
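
For reference, MASK_EXTR() extracts a bit-field by its mask without an
explicit shift; a simplified form of its definition and one of the converted
macros (for illustration only):

    /* Simplified; the real definition lives in xen/include/xen/lib.h. */
    #define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))

    /* e.g. ecap_eim(e) becomes MASK_EXTR(e, DMA_ECAP_EIM) and yields 0 or 1. */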

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - Only fix the alignment and defer introducing new definition to when
 they are needed
 (Suggested-by Roger Pau Monné)
 - remove parts of open-coded masks
v3:
 - new
---
 xen/drivers/passthrough/vtd/iommu.h | 86 +
 1 file changed, 48 insertions(+), 38 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index 72c1a2e..db80b31 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -26,28 +26,28 @@
  * Intel IOMMU register specification per version 1.0 public spec.
  */
 
-#defineDMAR_VER_REG0x0/* Arch version supported by this IOMMU */
-#defineDMAR_CAP_REG0x8/* Hardware supported capabilities */
-#defineDMAR_ECAP_REG0x10/* Extended capabilities supported */
-#defineDMAR_GCMD_REG0x18/* Global command register */
-#defineDMAR_GSTS_REG0x1c/* Global status register */
-#defineDMAR_RTADDR_REG0x20/* Root entry table */
-#defineDMAR_CCMD_REG0x28/* Context command reg */
-#defineDMAR_FSTS_REG0x34/* Fault Status register */
-#defineDMAR_FECTL_REG0x38/* Fault control register */
-#defineDMAR_FEDATA_REG0x3c/* Fault event interrupt data register */
-#defineDMAR_FEADDR_REG0x40/* Fault event interrupt addr register */
-#defineDMAR_FEUADDR_REG 0x44/* Upper address register */
-#defineDMAR_AFLOG_REG0x58/* Advanced Fault control */
-#defineDMAR_PMEN_REG0x64/* Enable Protected Memory Region */
-#defineDMAR_PLMBASE_REG 0x68/* PMRR Low addr */
-#defineDMAR_PLMLIMIT_REG 0x6c/* PMRR low limit */
-#defineDMAR_PHMBASE_REG 0x70/* pmrr high base addr */
-#defineDMAR_PHMLIMIT_REG 0x78/* pmrr high limit */
-#defineDMAR_IQH_REG0x80/* invalidation queue head */
-#defineDMAR_IQT_REG0x88/* invalidation queue tail */
-#defineDMAR_IQA_REG0x90/* invalidation queue addr */
-#defineDMAR_IRTA_REG   0xB8/* intr remap */
+#define DMAR_VER_REG0x0  /* Arch version supported by this IOMMU */
+#define DMAR_CAP_REG0x8  /* Hardware supported capabilities */
+#define DMAR_ECAP_REG   0x10 /* Extended capabilities supported */
+#define DMAR_GCMD_REG   0x18 /* Global command register */
+#define DMAR_GSTS_REG   0x1c /* Global status register */
+#define DMAR_RTADDR_REG 0x20 /* Root entry table */
+#define DMAR_CCMD_REG   0x28 /* Context command reg */
+#define DMAR_FSTS_REG   0x34 /* Fault Status register */
+#define DMAR_FECTL_REG  0x38 /* Fault control register */
+#define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */
+#define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */
+#define DMAR_FEUADDR_REG0x44 /* Upper address register */
+#define DMAR_AFLOG_REG  0x58 /* Advanced Fault control */
+#define DMAR_PMEN_REG   0x64 /* Enable Protected Memory Region */
+#define DMAR_PLMBASE_REG0x68 /* PMRR Low addr */
+#define DMAR_PLMLIMIT_REG   0x6c /* PMRR low limit */
+#define DMAR_PHMBASE_REG0x70 /* pmrr high base addr */
+#define DMAR_PHMLIMIT_REG   0x78 /* pmrr high limit */
+#define DMAR_IQH_REG0x80 /* invalidation queue head */
+#define DMAR_IQT_REG0x88 /* invalidation queue tail */
+#define DMAR_IQA_REG0x90 /* invalidation queue addr */
+#define DMAR_IRTA_REG   0xb8 /* intr remap */
 
 #define OFFSET_STRIDE(9)
 #define dmar_readl(dmar, reg) readl((dmar) + (reg))
@@ -93,16 +93,26 @@
  * Extended Capability Register
  */
 
+#define DMA_ECAP_SNP_CTL((uint64_t)1 << 7)
+#define DMA_ECAP_PASS_THRU  ((uint64_t)1 << 6)
+#define DMA_ECAP_CACHE_HINTS((uint64_t)1 << 5)
+#define DMA_ECAP_EIM((uint64_t)1 << 4)
+#define DMA_ECAP_INTR_REMAP ((uint64_t)1 << 3)
+#define DMA_ECAP_DEV_IOTLB  ((uint64_t)1 << 2)
+#define DMA_ECAP_QUEUED_INVAL   ((uint64_t)1 << 1)
+#define DMA_ECAP_COHERENT   ((uint64_t)1 << 0)
+
+#define ecap_snp_ctl(e) MASK_EXTR(e, DMA_ECAP_SNP_CTL)
+#define ecap_pass_thru(e)   MASK_EXTR(e, DMA_ECAP_PASS_THRU)
+#define ecap_cache_hints(e) MASK_EXTR(e, DMA_ECAP_CACHE_HINTS)
+#define ecap_eim(e) MASK_EXTR(e, DMA_ECAP_EIM)
+#define ecap_intr_remap(e)  MASK_EXTR(e, DMA_ECAP_INTR_REMAP)
+#define ecap_dev_iotlb(e)   MASK_EXTR(e, DMA_ECAP_DEV_IOTLB)
+#defin

[Xen-devel] [PATCH v4 10/28] x86/vvtd: Enable Interrupt Remapping through GCMD

2017-11-16 Thread Chao Gao
Software writes this field to enable/disable interrupt remapping. This
patch emulates the IRES field of GCMD. Currently, the guest's whole IRT is
mapped into Xen permanently to reduce interrupt delivery latency, and
the old mapping, if present, is undone when setting up a new one.
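
For context, the guest-side sequence being emulated here is roughly the
following (illustrative only; 'gcmd' is a software shadow of the command
bits and 'mmio' the mapped register window):

    writel(gcmd | DMA_GCMD_IRE, mmio + DMAR_GCMD_REG);
    while ( !(readl(mmio + DMAR_GSTS_REG) & DMA_GSTS_IRES) )
        cpu_relax();        /* wait until the (virtual) hardware reports IRES */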

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>
---
v4:
 - map the guest's interrupt remapping table into Xen permanently rather than
 mapping one specific page on demand.
---
 xen/drivers/passthrough/vtd/iommu.h |  3 +-
 xen/drivers/passthrough/vtd/vvtd.c  | 98 +
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index 8579843..9c59aeb 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -161,9 +161,10 @@
 #define DMA_GSTS_AFLS   (((u64)1) << 28)
 #define DMA_GSTS_WBFS   (((u64)1) << 27)
 #define DMA_GSTS_QIES   (((u64)1) <<26)
+#define DMA_GSTS_IRES_SHIFT 25
+#define DMA_GSTS_IRES   (((u64)1) << DMA_GSTS_IRES_SHIFT)
 #define DMA_GSTS_SIRTPS_SHIFT   24
 #define DMA_GSTS_SIRTPS (((u64)1) << DMA_GSTS_SIRTPS_SHIFT)
-#define DMA_GSTS_IRES   (((u64)1) <<25)
 #define DMA_GSTS_CFIS   (((u64)1) <<23)
 
 /* IRTA_REG */
diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
index f0476fe..06e522a 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "iommu.h"
 
@@ -37,6 +38,7 @@
 
 struct hvm_hw_vvtd {
 bool eim_enabled;
+bool intremap_enabled;
 
 /* Interrupt remapping table base gfn and the max of entries */
 uint16_t irt_max_entry;
@@ -52,6 +54,7 @@ struct vvtd {
 struct domain *domain;
 
 struct hvm_hw_vvtd hw;
+void *irt_base;
 };
 
 /* Setting viommu_verbose enables debugging messages of vIOMMU */
@@ -118,6 +121,77 @@ static void *domain_vvtd(const struct domain *d)
 return NULL;
 }
 
+static void *map_guest_pages(struct domain *d, uint64_t gfn, uint32_t nr)
+{
+mfn_t *mfn = xmalloc_array(mfn_t, nr);
+void* ret;
+int i;
+
+if ( !mfn )
+return NULL;
+
+for ( i = 0; i < nr; i++)
+{
+struct page_info *p = get_page_from_gfn(d, gfn + i, NULL, P2M_ALLOC);
+
+if ( !p || !get_page_type(p, PGT_writable_page) )
+{
+if ( p )
+put_page(p);
+goto undo;
+}
+
+mfn[i] = _mfn(page_to_mfn(p));
+}
+
+ret = vmap(mfn, nr);
+if ( ret == NULL )
+goto undo;
+xfree(mfn);
+
+return ret;
+
+ undo:
+for ( ; --i >= 0; )
+put_page_and_type(mfn_to_page(mfn_x(mfn[i])));
+xfree(mfn);
+gprintk(XENLOG_ERR, "Failed to map guest pages %lx nr %x\n", gfn, nr);
+
+return NULL;
+}
+
+static void unmap_guest_pages(void *va, uint32_t nr)
+{
+unsigned long *mfn = xmalloc_array(unsigned long, nr);
+int i;
+void *va_copy = va;
+
+if ( !mfn )
+{
+printk("%s %d: No free memory\n", __FILE__, __LINE__);
+return;
+}
+
+for ( i = 0; i < nr; i++, va += PAGE_SIZE)
+mfn[i] = domain_page_map_to_mfn(va);
+
+vunmap(va_copy);
+
+for ( i = 0; i < nr; i++)
+put_page_and_type(mfn_to_page(mfn[i]));
+}
+
+static void write_gcmd_ire(struct vvtd *vvtd, uint32_t val)
+{
+bool set = val & DMA_GCMD_IRE;
+
+vvtd_info("%sable Interrupt Remapping\n", set ? "En" : "Dis");
+
+vvtd->hw.intremap_enabled = set;
+(set ? vvtd_set_bit : vvtd_clear_bit)
+(vvtd, DMAR_GSTS_REG, DMA_GSTS_IRES_SHIFT);
+}
+
 static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t val)
 {
 uint64_t irta = vvtd_get_reg_quad(vvtd, DMAR_IRTA_REG);
@@ -131,16 +205,29 @@ static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t 
val)
  * the 'Set Interrupt Remap Table Pointer' operation.
  */
 vvtd_clear_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_SIRTPS_SHIFT);
+if ( vvtd->hw.intremap_enabled )
+vvtd_info("Update Interrupt Remapping Table when active\n");
 
 if ( gfn_x(vvtd->hw.irt) != PFN_DOWN(DMA_IRTA_ADDR(irta)) ||
  vvtd->hw.irt_max_entry != DMA_IRTA_SIZE(irta) )
 {
+if ( vvtd->irt_base )
+{
+unmap_guest_pages(vvtd->irt_base,
+  PFN_UP(vvtd->hw.irt_max_entry *
+ sizeof(struct iremap_entry)));
+vvtd->irt_base = NULL;
+}
 vvtd->hw.irt = _gfn(PFN_DOWN(DMA_IRTA_ADDR(irta)));
 vvtd->hw.irt_max_entry = DMA_IRTA_SIZE(irta);
 vvtd->hw.eim_enabled = !!(irta & IRTA_EIME);
 vvtd_info("Update IR info (addr=%lx eim=%d size=%d)\n&qu

[Xen-devel] [PATCH v4 07/28] x86/hvm: Introduce a emulated VTD for HVM

2017-11-16 Thread Chao Gao
This patch adds create/destroy functions for the emulated VT-d
and adapts it to the common VIOMMU abstraction.

Since the Makefile is changed here anyway, take the chance to put all
files in alphabetical order.
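
The registration added here has roughly the following shape (the callback
names are illustrative; the hooks themselves are defined by struct
viommu_ops in the vIOMMU framework patch):

    static const struct viommu_ops vvtd_hvm_ops = {
        .type    = VIOMMU_TYPE_INTEL_VTD,
        .create  = vvtd_create,     /* allocate and reset the emulated VT-d */
        .destroy = vvtd_destroy,    /* tear it down on domain destruction */
    };
    REGISTER_VIOMMU(vvtd_hvm_ops);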

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
- use REGISTER_VIOMMU
- shrink the size of hvm_hw_vvtd_regs
- make hvm_hw_vvtd_regs a field inside struct vvtd
---
 xen/drivers/passthrough/vtd/Makefile |   7 +-
 xen/drivers/passthrough/vtd/iommu.h  |   9 +++
 xen/drivers/passthrough/vtd/vvtd.c   | 150 +++
 3 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 xen/drivers/passthrough/vtd/vvtd.c

diff --git a/xen/drivers/passthrough/vtd/Makefile 
b/xen/drivers/passthrough/vtd/Makefile
index f302653..163c7fe 100644
--- a/xen/drivers/passthrough/vtd/Makefile
+++ b/xen/drivers/passthrough/vtd/Makefile
@@ -1,8 +1,9 @@
 subdir-$(CONFIG_X86) += x86
 
-obj-y += iommu.o
 obj-y += dmar.o
-obj-y += utils.o
-obj-y += qinval.o
 obj-y += intremap.o
+obj-y += iommu.o
+obj-y += qinval.o
 obj-y += quirks.o
+obj-y += utils.o
+obj-$(CONFIG_VIOMMU) += vvtd.o
diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index db80b31..f2ef3dd 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -47,6 +47,7 @@
 #define DMAR_IQH_REG0x80 /* invalidation queue head */
 #define DMAR_IQT_REG0x88 /* invalidation queue tail */
 #define DMAR_IQA_REG0x90 /* invalidation queue addr */
+#define DMAR_IECTL_REG  0xa0 /* invalidation event control register */
 #define DMAR_IRTA_REG   0xb8 /* intr remap */
 
 #define OFFSET_STRIDE(9)
@@ -89,6 +90,12 @@
 #define cap_afl(c)(((c) >> 3) & 1)
 #define cap_ndoms(c)(1 << (4 + 2 * ((c) & 0x7)))
 
+#define cap_set_num_fault_regs(c)   ((((c) - 1) & 0xff) << 40)
+#define cap_set_fault_reg_offset(c) ((((c) / 16) & 0x3ff) << 24)
+#define cap_set_mgaw(c) ((((c) - 1) & 0x3f) << 16)
+#define cap_set_sagaw(c)(((c) & 0x1f) << 8)
+#define cap_set_ndoms(c)((c) & 0x7)
+
 /*
  * Extended Capability Register
  */
@@ -114,6 +121,8 @@
 #define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1)
 #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
 
+#define ecap_set_mhmv(e) (((e) & 0xf) << 20)
+
 /* IOTLB_REG */
 #define DMA_TLB_FLUSH_GRANU_OFFSET  60
 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60)
diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
new file mode 100644
index 000..9f76ccf
--- /dev/null
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -0,0 +1,150 @@
+/*
+ * vvtd.c
+ *
+ * virtualize VTD for HVM.
+ *
+ * Copyright (C) 2017 Chao Gao, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "iommu.h"
+
+/* Supported capabilities by vvtd */
+#define VVTD_MAX_CAPS VIOMMU_CAP_IRQ_REMAPPING
+
+#define VVTD_FRCD_NUM   1ULL
+#define VVTD_FRCD_START (DMAR_IRTA_REG + 8)
+#define VVTD_FRCD_END   (VVTD_FRCD_START + VVTD_FRCD_NUM * 16)
+#define VVTD_MAX_OFFSET VVTD_FRCD_END
+
+struct hvm_hw_vvtd {
+uint32_t regs[VVTD_MAX_OFFSET/sizeof(uint32_t)];
+};
+
+struct vvtd {
+/* Base address of remapping hardware register-set */
+uint64_t base_addr;
+/* Point back to the owner domain */
+struct domain *domain;
+
+struct hvm_hw_vvtd hw;
+};
+
+/* Setting viommu_verbose enables debugging messages of vIOMMU */
+bool __read_mostly viommu_verbose;
+boolean_runtime_param("viommu_verbose", viommu_verbose);
+
+#ifndef NDEBUG
+#define vvtd_info(fmt...) do {\
+if ( viommu_verbose ) \
+gprintk(XENLOG_INFO, ## fmt); \
+} while(0)
+/*
+ * Use printk and '_G_' prefix because vvtd_debug() may be called
+ * in the context of another domain's vCPU. Don't output 'current'
+ * information to avoid confusion.
+ */
+#define vvtd_debug(fmt...) do {   \
+if ( viommu_verbose && printk_ratelimit())\
+printk(XENLOG_G_DEBUG fmt);   \
+} while(0)
+#else
+#define vvtd_info(...) do {} while(0)
+#define vvtd_de

[Xen-devel] [PATCH v4 00/28] add vIOMMU support with irq remapping function of virtual VT-d

2017-11-16 Thread Chao Gao
This patchset introduces a vIOMMU framework and adds virtual VT-d
interrupt remapping support according to the "Xen virtual IOMMU high level
design doc V3" (https://lists.xenproject.org/archives/html/xen-devel/
2016-11/msg01391.html).

- vIOMMU framework
The new framework provides viommu_ops and helper functions to abstract
vIOMMU operations (e.g. create, destroy, handle irq remapping request
and so on). Vendors (Intel, ARM, AMD and so on) can implement their
vIOMMU callbacks.

- Virtual VT-d
We enable the irq remapping function, covering both MSI and IOAPIC
interrupts. Posted interrupt mode emulation, and running virtual VT-d on
a host with posted interrupt mode enabled, are not supported yet; they
will be added later.

In case of conflicts, this series also can be found in my personal github:
Xen: https://github.com/gc1008/viommu_xen.git vIOMMU4
Qemu: https://github.com/gc1008/viommu_qemu.git vIOMMU3

Any comments would be highly appreciated. And below is change history.

Changes since v3:
 - add logic to build DMAR table for PVH guest. But only very limited tests are
 performed on PVH guest.
 - use one interface to bind guest remappable and non-remappable interrupts
 to physical interrupts. To achieve this, the current binding interface has to
 be changed. The advantage is that it simplifies the code supporting the
 new-format guest interrupts. The disadvantage is that it is clearly
 incompatible with old QEMU.
 - VT-d posted interrupt feature can be used to deliver guest remappable
 interrupt. The guest interrupt attributes (vector, affinity) are decoded from
 guest IRTE and then accordingly written to host IRTE. In this version, when
 guest invalidates an IRTE, the host IRTE will be updated according to the
 new guest IRTE.
 - add draining in-flight interrupt support. When guest invalidates
 an IRTE, the in-flight interrupt related to the IRTE should be drained.
 This version provides a very simple solution: process QI only when no
 interrupt is being delivered, which definitely implies there is no in-flight
 interrupt.
 - use locks in QI and fault handling sub-feature. These locks guarantee
 the registers/status won't be changed by guest when vvtd is dealing with
 faults or invalidation requests.
 - move viommu structure under hvm domain rather than making it a field of
 struct domain.
 - remove the unneeded domctl interface for destroying a viommu. Currently,
 dynamic destruction isn't needed.
 - reorder the patches per Roger's suggestion: the viommu abstract goes first,
 then the implementation of emulated VT-d, several hooks of
 configuring/delivering guest interrupt and EOI, and related changes of
 toolstack.
 - fix many coding style issues pointed out by Roger.

Change since v2:
   1) Remove vIOMMU hypercall of query capabilities and introduce when 
necessary.
   2) Remove length field of vIOMMU create parameter of vIOMMU hypercall
   3) Introduce irq remapping mode callback to vIOMMU framework and vIOMMU 
device models
can check irq remapping mode by vendor specific ways.
   4) Update vIOMMU docs.
   5) Other changes please see patches' change logs.

Change since v1:
   1) Fix coding style issues
   2) Add definitions for vIOMMU type and capabilities
   3) Change vIOMMU kconfig and select vIOMMU default on x86
   4) Put vIOMMU creation in libxl__arch_domain_create()
   5) Make vIOMMU structure of tool stack more general for both PV and HVM.

Change since RFC v2:
   1) Move vvtd.c to the drivers/passthrough/vtd directory.
   2) Make vIOMMU always built in on x86
   3) Add new boot cmd "viommu" to enable viommu function
   4) Fix some code style issues.

Change since RFC v1:
   1) Add Xen virtual IOMMU doc docs/misc/viommu.txt
   2) Move the vIOMMU hypercalls for creating/destroying a vIOMMU and querying
capabilities from dmop to domctl, as suggested by Paul Durrant, because
these hypercalls can be done in the tool stack and more VM modes (e.g. PVH
or other modes that don't use Qemu) can benefit.
   3) Add check of input MMIO address and length.
   4) Add iommu_type in the vIOMMU hypercall parameter to specify the
vendor vIOMMU device model (e.g. Intel VT-d, AMD or ARM IOMMU; so far
only Intel VT-d is supported).
   5) Add save and restore support for vvtd

Chao Gao (23):
  vtd: clean-up and preparation for vvtd
  x86/hvm: Introduce a emulated VTD for HVM
  x86/vvtd: Add MMIO handler for VVTD
  x86/vvtd: Set Interrupt Remapping Table Pointer through GCMD
  x86/vvtd: Enable Interrupt Remapping through GCMD
  x86/vvtd: Process interrupt remapping request
  x86/vvtd: decode interrupt attribute from IRTE
  x86/vvtd: add a helper function to decide the interrupt format
  x86/vvtd: Handle interrupt translation faults
  x86/vvtd: Enable Queued Invalidation through GCMD
  x86/vvtd: Add queued invalidation (QI) support
  x86/vvtd: save and restore emulated VT-d
  x86/vioapic: Hook interrupt delivery of vIOAPIC
  x86/vioapic: extend vioapic_get_vector() to support remapping format
RTE
  xen/pt: when binding guest msi, 

[Xen-devel] [PATCH v4 03/28] VIOMMU: Add irq request callback to deal with irq remapping

2017-11-16 Thread Chao Gao
From: Lan Tianyu <tianyu@intel.com>

This patch adds an irq request callback so that the platform implementation
can deal with irq remapping requests.
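
A caller (e.g. vMSI) is expected to build an arch_irq_remapping_request and
hand it to the common layer, roughly as below (the variable names are
illustrative):

    struct arch_irq_remapping_request request = {
        .type = VIOMMU_REQUEST_IRQ_MSI,
        .source_id = sbdf,             /* requester id of the device */
        .msg.msi.addr = msi_addr,
        .msg.msi.data = msi_data,
    };

    rc = viommu_handle_irq_request(d, &request);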

Signed-off-by: Lan Tianyu <tianyu@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/common/viommu.c  | 15 
 xen/include/asm-x86/viommu.h | 54 
 xen/include/xen/viommu.h |  6 +
 3 files changed, 75 insertions(+)
 create mode 100644 xen/include/asm-x86/viommu.h

diff --git a/xen/common/viommu.c b/xen/common/viommu.c
index fd8b7fd..53d4b70 100644
--- a/xen/common/viommu.c
+++ b/xen/common/viommu.c
@@ -114,6 +114,21 @@ int viommu_domctl(struct domain *d, struct 
xen_domctl_viommu_op *op)
 return rc;
 }
 
+int viommu_handle_irq_request(const struct domain *d,
+  const struct arch_irq_remapping_request *request)
+{
+struct viommu *viommu = d->arch.hvm_domain.viommu;
+
+if ( !viommu )
+return -ENODEV;
+
+ASSERT(viommu->ops);
+if ( !viommu->ops->handle_irq_request )
+return -EINVAL;
+
+return viommu->ops->handle_irq_request(d, request);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h
new file mode 100644
index 000..01ec80e
--- /dev/null
+++ b/xen/include/asm-x86/viommu.h
@@ -0,0 +1,54 @@
+/*
+ * include/asm-x86/viommu.h
+ *
+ * Copyright (c) 2017 Intel Corporation.
+ * Author: Lan Tianyu <tianyu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __ARCH_X86_VIOMMU_H__
+#define __ARCH_X86_VIOMMU_H__
+
+/* IRQ request type */
+enum viommu_irq_request_type {
+VIOMMU_REQUEST_IRQ_MSI = 0,
+VIOMMU_REQUEST_IRQ_APIC = 1
+};
+
+struct arch_irq_remapping_request
+{
+union {
+/* MSI */
+struct {
+uint64_t addr;
+uint32_t data;
+} msi;
+/* Redirection Entry in IOAPIC */
+uint64_t rte;
+} msg;
+uint16_t source_id;
+enum viommu_irq_request_type type;
+};
+
+#endif /* __ARCH_X86_VIOMMU_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h
index a859d80..67e25d5 100644
--- a/xen/include/xen/viommu.h
+++ b/xen/include/xen/viommu.h
@@ -22,12 +22,16 @@
 
 #ifdef CONFIG_VIOMMU
 
+#include 
+
 struct viommu;
 
 struct viommu_ops {
 uint8_t type;
 int (*create)(struct domain *d, struct viommu *viommu);
 int (*destroy)(struct viommu *viommu);
+int (*handle_irq_request)(const struct domain *d,
+  const struct arch_irq_remapping_request 
*request);
 };
 
 struct viommu {
@@ -44,6 +48,8 @@ struct viommu {
 int viommu_register_type(uint8_t type, struct viommu_ops *ops);
 int viommu_destroy_domain(struct domain *d);
 int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op);
+int viommu_handle_irq_request(const struct domain *d,
+  const struct arch_irq_remapping_request 
*request);
 #else
 static inline int viommu_destroy_domain(struct domain *d)
 {
-- 
1.8.3.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 09/28] x86/vvtd: Set Interrupt Remapping Table Pointer through GCMD

2017-11-16 Thread Chao Gao
Software sets SIRTP field of GCMD to set/update the interrupt remapping
table pointer used by hardware. The interrupt remapping table pointer is
specified through the Interrupt Remapping Table Address (IRTA_REG)
register.

This patch emulates this operation and adds some new fields in VVTD to track
information about the interrupt remapping table (e.g. the table's gfn and the
maximum number of entries).
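
As a worked example of the new fields (numbers for illustration only): if
the guest programs IRTA with S = IRTA[3:0] = 7, DMA_IRTA_SIZE() yields
2^(7+1) = 256 entries; at 16 bytes per iremap_entry that is 4096 bytes, i.e.
a single guest page to track and map:

    unsigned int nr_entries = DMA_IRTA_SIZE(irta);    /* 256 for S == 7 */
    unsigned int nr_pages   = PFN_UP(nr_entries * sizeof(struct iremap_entry)); /* 1 */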

Signed-off-by: Chao Gao <chao@intel.com>
Signed-off-by: Lan Tianyu <tianyu@intel.com>

---
v4:
 - declare eim_enabled as bool and irt as gfn_t
 - rename vvtd_handle_gcmd_sirtp() to write_gcmd_sirtp()

v3:
 - ignore unaligned r/w of vt-d hardware registers and return X86EMUL_OK
---
 xen/drivers/passthrough/vtd/iommu.h | 16 ++-
 xen/drivers/passthrough/vtd/vvtd.c  | 86 +
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/iommu.h 
b/xen/drivers/passthrough/vtd/iommu.h
index f2ef3dd..8579843 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -48,7 +48,8 @@
 #define DMAR_IQT_REG0x88 /* invalidation queue tail */
 #define DMAR_IQA_REG0x90 /* invalidation queue addr */
 #define DMAR_IECTL_REG  0xa0 /* invalidation event control register */
-#define DMAR_IRTA_REG   0xb8 /* intr remap */
+#define DMAR_IRTA_REG   0xb8 /* base address of intr remap table */
+#define DMAR_IRTUA_REG  0xbc /* upper address of intr remap table */
 
 #define OFFSET_STRIDE(9)
 #define dmar_readl(dmar, reg) readl((dmar) + (reg))
@@ -150,6 +151,9 @@
 #define DMA_GCMD_SIRTP  (((u64)1) << 24)
 #define DMA_GCMD_CFI(((u64)1) << 23)
 
+/* mask of one-shot bits */
+#define DMA_GCMD_ONE_SHOT_MASK 0x96ff
+
 /* GSTS_REG */
 #define DMA_GSTS_TES(((u64)1) << 31)
 #define DMA_GSTS_RTPS   (((u64)1) << 30)
@@ -157,10 +161,18 @@
 #define DMA_GSTS_AFLS   (((u64)1) << 28)
 #define DMA_GSTS_WBFS   (((u64)1) << 27)
 #define DMA_GSTS_QIES   (((u64)1) <<26)
+#define DMA_GSTS_SIRTPS_SHIFT   24
+#define DMA_GSTS_SIRTPS (((u64)1) << DMA_GSTS_SIRTPS_SHIFT)
 #define DMA_GSTS_IRES   (((u64)1) <<25)
-#define DMA_GSTS_SIRTPS (((u64)1) << 24)
 #define DMA_GSTS_CFIS   (((u64)1) <<23)
 
+/* IRTA_REG */
+/* The base of 4KB aligned interrupt remapping table */
+#define DMA_IRTA_ADDR(val)  ((val) & ~0xfffULL)
+/* The size of remapping table is 2^(x+1), where x is the size field in IRTA */
+#define DMA_IRTA_S(val) (val & 0xf)
+#define DMA_IRTA_SIZE(val)  (1UL << (DMA_IRTA_S(val) + 1))
+
 /* PMEN_REG */
 #define DMA_PMEN_EPM(((u32)1) << 31)
 #define DMA_PMEN_PRS(((u32)1) << 0)
diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
b/xen/drivers/passthrough/vtd/vvtd.c
index d78d878..f0476fe 100644
--- a/xen/drivers/passthrough/vtd/vvtd.c
+++ b/xen/drivers/passthrough/vtd/vvtd.c
@@ -36,6 +36,12 @@
 #define VVTD_MAX_OFFSET VVTD_FRCD_END
 
 struct hvm_hw_vvtd {
+bool eim_enabled;
+
+/* Interrupt remapping table base gfn and the max of entries */
+uint16_t irt_max_entry;
+gfn_t irt;
+
 uint32_t regs[VVTD_MAX_OFFSET/sizeof(uint32_t)];
 };
 
@@ -73,6 +79,16 @@ boolean_runtime_param("viommu_verbose", viommu_verbose);
 
 #define VVTD_REG_POS(vvtd, offset) &(vvtd->hw.regs[offset/sizeof(uint32_t)])
 
+static inline void vvtd_set_bit(struct vvtd *vvtd, uint32_t reg, int nr)
+{
+__set_bit(nr, VVTD_REG_POS(vvtd, reg));
+}
+
+static inline void vvtd_clear_bit(struct vvtd *vvtd, uint32_t reg, int nr)
+{
+__clear_bit(nr, VVTD_REG_POS(vvtd, reg));
+}
+
 static inline void vvtd_set_reg(struct vvtd *vvtd, uint32_t reg, uint32_t 
value)
 {
 *VVTD_REG_POS(vvtd, reg) = value;
@@ -102,6 +118,52 @@ static void *domain_vvtd(const struct domain *d)
 return NULL;
 }
 
+static void write_gcmd_sirtp(struct vvtd *vvtd, uint32_t val)
+{
+uint64_t irta = vvtd_get_reg_quad(vvtd, DMAR_IRTA_REG);
+
+if ( !(val & DMA_GCMD_SIRTP) )
+return;
+
+/*
+ * Hardware clears this bit when software sets the SIRTPS field in
+ * the Global Command register and sets it when hardware completes
+ * the 'Set Interrupt Remap Table Pointer' operation.
+ */
+vvtd_clear_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_SIRTPS_SHIFT);
+
+if ( gfn_x(vvtd->hw.irt) != PFN_DOWN(DMA_IRTA_ADDR(irta)) ||
+ vvtd->hw.irt_max_entry != DMA_IRTA_SIZE(irta) )
+{
+vvtd->hw.irt = _gfn(PFN_DOWN(DMA_IRTA_ADDR(irta)));
+vvtd->hw.irt_max_entry = DMA_IRTA_SIZE(irta);
+vvtd->hw.eim_enabled = !!(irta & IRTA_EIME);
+vvtd_info("Update IR info (addr=%lx eim=%d size=%d)\n",
+  gfn_x(vvtd->hw.irt), vvtd->hw.eim_enabled,
+  vvtd->hw.irt_max_entry);
+}
+vvtd_set_bit(vvtd, DMAR_GSTS_REG, DMA_GSTS_SIRTPS_SHIFT);
+}
+
+static void vvtd_write_gcmd(struct 

[Xen-devel] [PATCH v4 05/28] VIOMMU: Introduce callback of checking irq remapping mode

2017-11-16 Thread Chao Gao
From: Lan Tianyu <tianyu@intel.com>

This patch adds a callback for vIOAPIC and vMSI to check whether interrupt
remapping is enabled.

Signed-off-by: Lan Tianyu <tianyu@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/common/viommu.c  | 15 +++
 xen/include/xen/viommu.h |  4 
 2 files changed, 19 insertions(+)

diff --git a/xen/common/viommu.c b/xen/common/viommu.c
index 9eafdef..72173c3 100644
--- a/xen/common/viommu.c
+++ b/xen/common/viommu.c
@@ -145,6 +145,21 @@ int viommu_get_irq_info(const struct domain *d,
 return viommu->ops->get_irq_info(d, request, irq_info);
 }
 
+bool viommu_check_irq_remapping(const struct domain *d,
+const struct arch_irq_remapping_request 
*request)
+{
+const struct viommu *viommu = d->arch.hvm_domain.viommu;
+
+if ( !viommu )
+return false;
+
+ASSERT(viommu->ops);
+if ( !viommu->ops->check_irq_remapping )
+return false;
+
+return viommu->ops->check_irq_remapping(d, request);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h
index 73b853f..c1dfaec 100644
--- a/xen/include/xen/viommu.h
+++ b/xen/include/xen/viommu.h
@@ -29,6 +29,8 @@ struct viommu;
 struct viommu_ops {
 uint8_t type;
 int (*create)(struct domain *d, struct viommu *viommu);
+bool (*check_irq_remapping)(const struct domain *d,
+const struct arch_irq_remapping_request 
*request);
 int (*destroy)(struct viommu *viommu);
 int (*handle_irq_request)(const struct domain *d,
   const struct arch_irq_remapping_request 
*request);
@@ -56,6 +58,8 @@ int viommu_handle_irq_request(const struct domain *d,
 int viommu_get_irq_info(const struct domain *d,
 const struct arch_irq_remapping_request *request,
 struct arch_irq_remapping_info *irq_info);
+bool viommu_check_irq_remapping(const struct domain *d,
+const struct arch_irq_remapping_request 
*request);
 #else
 static inline int viommu_destroy_domain(struct domain *d)
 {
-- 
1.8.3.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 04/28] VIOMMU: Add get irq info callback to convert irq remapping request

2017-11-16 Thread Chao Gao
From: Lan Tianyu <tianyu@intel.com>

This patch adds a get_irq_info callback so that the platform implementation
can convert an irq remapping request into irq info (e.g. vector, dest,
dest_mode and so on).
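
The expected caller pattern is roughly the following (sketch only; 'request'
is assumed to be already filled in the arch-specific format and the locals
are illustrative):

    struct arch_irq_remapping_info info;

    if ( viommu_get_irq_info(d, &request, &info) == 0 )
    {
        vector        = info.vector;        /* attributes decoded from the IRTE */
        dest          = info.dest;
        dest_mode     = info.dest_mode;
        delivery_mode = info.delivery_mode;
    }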

Signed-off-by: Lan Tianyu <tianyu@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/common/viommu.c  | 16 
 xen/include/asm-x86/viommu.h |  8 
 xen/include/xen/viommu.h |  6 ++
 3 files changed, 30 insertions(+)

diff --git a/xen/common/viommu.c b/xen/common/viommu.c
index 53d4b70..9eafdef 100644
--- a/xen/common/viommu.c
+++ b/xen/common/viommu.c
@@ -129,6 +129,22 @@ int viommu_handle_irq_request(const struct domain *d,
 return viommu->ops->handle_irq_request(d, request);
 }
 
+int viommu_get_irq_info(const struct domain *d,
+const struct arch_irq_remapping_request *request,
+struct arch_irq_remapping_info *irq_info)
+{
+const struct viommu *viommu = d->arch.hvm_domain.viommu;
+
+if ( !viommu )
+return -EINVAL;
+
+ASSERT(viommu->ops);
+if ( !viommu->ops->get_irq_info )
+return -EINVAL;
+
+return viommu->ops->get_irq_info(d, request, irq_info);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/asm-x86/viommu.h b/xen/include/asm-x86/viommu.h
index 01ec80e..3d995ba 100644
--- a/xen/include/asm-x86/viommu.h
+++ b/xen/include/asm-x86/viommu.h
@@ -26,6 +26,14 @@ enum viommu_irq_request_type {
 VIOMMU_REQUEST_IRQ_APIC = 1
 };
 
+struct arch_irq_remapping_info
+{
+uint8_t dest_mode:1;
+uint8_t delivery_mode:3;
+uint8_t  vector;
+uint32_t dest;
+};
+
 struct arch_irq_remapping_request
 {
 union {
diff --git a/xen/include/xen/viommu.h b/xen/include/xen/viommu.h
index 67e25d5..73b853f 100644
--- a/xen/include/xen/viommu.h
+++ b/xen/include/xen/viommu.h
@@ -32,6 +32,9 @@ struct viommu_ops {
 int (*destroy)(struct viommu *viommu);
 int (*handle_irq_request)(const struct domain *d,
   const struct arch_irq_remapping_request 
*request);
+int (*get_irq_info)(const struct domain *d,
+const struct arch_irq_remapping_request *request,
+struct arch_irq_remapping_info *info);
 };
 
 struct viommu {
@@ -50,6 +53,9 @@ int viommu_destroy_domain(struct domain *d);
 int viommu_domctl(struct domain *d, struct xen_domctl_viommu_op *op);
 int viommu_handle_irq_request(const struct domain *d,
   const struct arch_irq_remapping_request 
*request);
+int viommu_get_irq_info(const struct domain *d,
+const struct arch_irq_remapping_request *request,
+struct arch_irq_remapping_info *irq_info);
 #else
 static inline int viommu_destroy_domain(struct domain *d)
 {
-- 
1.8.3.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v4 02/28] VIOMMU: Add vIOMMU framework and vIOMMU domctl

2017-11-16 Thread Chao Gao
From: Lan Tianyu <tianyu@intel.com>

This patch introduces an abstraction layer for the arch vIOMMU implementation
and a vIOMMU domctl to deal with requests from the tool stack. Arch vIOMMU code
needs to provide callbacks. The vIOMMU domctl supports creating a vIOMMU
instance in the hypervisor; the instance will be destroyed when the domain is
destroyed.
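
From the tool stack's point of view, creating an instance boils down to a
domctl of the following shape (the MMIO base below is an arbitrary example
value):

    struct xen_domctl_viommu_op op = {
        .cmd = XEN_DOMCTL_viommu_create,
        .u.create = {
            .type = VIOMMU_TYPE_INTEL_VTD,
            .base_address = 0xfed90000UL,            /* example only */
            .capabilities = VIOMMU_CAP_IRQ_REMAPPING,
        },
    };

    rc = viommu_domctl(d, &op);   /* on success, op.u.create.id holds the new id */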

Signed-off-by: Lan Tianyu <tianyu@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
v4:
 - introduce REGISTER_VIOMMU() to register viommu types and ops.
 - remove unneeded domctl interface to destroy viommu.
---
 docs/misc/xen-command-line.markdown |   7 ++
 xen/arch/x86/Kconfig|   1 +
 xen/arch/x86/hvm/hvm.c  |   3 +
 xen/arch/x86/xen.lds.S  |   3 +
 xen/common/Kconfig  |   3 +
 xen/common/Makefile |   1 +
 xen/common/domctl.c |   7 ++
 xen/common/viommu.c | 125 
 xen/include/asm-x86/hvm/domain.h|   3 +
 xen/include/public/domctl.h |  31 +
 xen/include/xen/viommu.h|  69 
 11 files changed, 253 insertions(+)
 create mode 100644 xen/common/viommu.c
 create mode 100644 xen/include/xen/viommu.h

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index eb4995e..d097382 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1836,3 +1836,10 @@ mode.
 > Default: `true`
 
 Permit use of the `xsave/xrstor` instructions.
+
+### viommu
+> `= <boolean>`
+
+> Default: `false`
+
+Permit use of viommu interface to create and destroy viommu device model.
diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index 64955dc..df254e4 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -25,6 +25,7 @@ config X86
select HAS_UBSAN
select NUMA
select VGA
+   select VIOMMU
 
 config ARCH_DEFCONFIG
string
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 205b4cb..964418a 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -693,6 +694,8 @@ void hvm_domain_relinquish_resources(struct domain *d)
 pmtimer_deinit(d);
 hpet_deinit(d);
 }
+
+viommu_destroy_domain(d);
 }
 
 void hvm_domain_destroy(struct domain *d)
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index d5e8821..7f8d2b8 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -231,6 +231,9 @@ SECTIONS
__start_schedulers_array = .;
*(.data.schedulers)
__end_schedulers_array = .;
+   __start_viommus_array = .;
+   *(.data.viommus)
+   __end_viommus_array = .;
   } :text
 
   .data : {/* Data */
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 103ef44..62aaa76 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -52,6 +52,9 @@ config HAS_CHECKPOLICY
string
option env="XEN_HAS_CHECKPOLICY"
 
+config VIOMMU
+   bool
+
 config KEXEC
bool "kexec support"
default y
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 66cc2c8..182b3ac 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -56,6 +56,7 @@ obj-y += time.o
 obj-y += timer.o
 obj-y += trace.o
 obj-y += version.o
+obj-$(CONFIG_VIOMMU) += viommu.o
 obj-y += virtual_region.o
 obj-y += vm_event.o
 obj-y += vmap.o
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 3c6fa4e..9c5651d 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1155,6 +1156,12 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) 
u_domctl)
  op->u.set_gnttab_limits.maptrack_frames);
 break;
 
+case XEN_DOMCTL_viommu_op:
+ret = viommu_domctl(d, &op->u.viommu_op);
+if ( !ret )
+copyback = 1;
+break;
+
 default:
 ret = arch_do_domctl(op, d, u_domctl);
 break;
diff --git a/xen/common/viommu.c b/xen/common/viommu.c
new file mode 100644
index 000..fd8b7fd
--- /dev/null
+++ b/xen/common/viommu.c
@@ -0,0 +1,125 @@
+/*
+ * common/viommu.c
+ *
+ * Copyright (c) 2017 Intel Corporation
+ * Author: Lan Tianyu <tianyu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Pu

[Xen-devel] [PATCH v4 01/28] Xen/doc: Add Xen virtual IOMMU doc

2017-11-16 Thread Chao Gao
From: Lan Tianyu <tianyu@intel.com>

This patch is to add Xen virtual IOMMU doc to introduce motivation,
framework, vIOMMU hypercall and xl configuration.

Signed-off-by: Lan Tianyu <tianyu@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 docs/misc/viommu.txt | 120 +++
 1 file changed, 120 insertions(+)
 create mode 100644 docs/misc/viommu.txt

diff --git a/docs/misc/viommu.txt b/docs/misc/viommu.txt
new file mode 100644
index 000..472d2b5
--- /dev/null
+++ b/docs/misc/viommu.txt
@@ -0,0 +1,120 @@
+Xen virtual IOMMU
+
+Motivation
+==
+Enable more than 128 vcpu support
+
+The current requirements of HPC cloud services call for VMs with a high
+number of CPUs in order to achieve high performance in parallel
+computing.
+
+To support >128 vcpus, X2APIC mode in guest is necessary because legacy
+APIC(XAPIC) just supports 8-bit APIC ID. The APIC ID used by Xen is
+CPU ID * 2 (ie: CPU 127 has APIC ID 254, which is the last one available
+in xAPIC mode) and so it can support at most 128 vcpus. x2APIC mode
+supports 32-bit APIC IDs and requires the interrupt remapping functionality
+of a vIOMMU if the guest wishes to route interrupts to all available vCPUs.
+
+PCI MSI/IOAPIC can only send interrupt message containing 8-bit APIC ID,
+which cannot address cpus with >254 APIC ID. Interrupt remapping supports
+32-bit APIC ID and so it's necessary for >128 vcpus support.
+
+vIOMMU Architecture
+===
+The vIOMMU device model is inside the Xen hypervisor for the following reasons:
+1) Avoid round trips between Qemu and Xen hypervisor
+2) Ease of integration with the rest of hypervisor
+3) PVH doesn't use Qemu
+
+* Interrupt remapping overview.
+Interrupts from virtual devices and physical devices are delivered
+to vLAPIC from vIOAPIC and vMSI. vIOMMU needs to remap interrupt during
+this procedure.
+
++---+
+|Qemu   |VM |
+|   | ++|
+|   | |  Device driver ||
+|   | ++---+|
+|   |  ^|
+|   ++  | ++---+|
+|   | Virtual device |  | |  IRQ subsystem ||
+|   +---++  | ++---+|
+|   |   |  ^|
+|   |   |  ||
++---+---+
+|hypervisor |  | VIRQ   |
+|   |+-++   |
+|   ||  vLAPIC  |   |
+|   |VIRQ+-++   |
+|   |  ^|
+|   |  ||
+|   |+-++   |
+|   ||  vIOMMU  |   |
+|   |+-++   |
+|   |  ^|
+|   |  ||
+|   |+-++   |
+|   ||   vIOAPIC/vMSI   |   |
+|   |++++   |
+|   | ^^|
+|   +-+||
+|  ||
++---+
+HW |IRQ
++---+
+|   PCI Device  |
++---+
+
+
+vIOMMU hypercall
+
+Introduce a new domctl hypercall "xen_domctl_viommu_op" to create
+vIOMMU instances in the hypervisor. A vIOMMU instance will be destroyed
+when the domain is destroyed.
+
+* vIOMMU hypercall parameter structure
+
+/* vIOMMU type - specify vendor vIOMMU device model */
+#define VIOMMU_TYPE_INTEL_VTD 0
+
+/* vIOMMU capabilities */
+#define VIOMMU_CAP_IRQ_REMAPPING  (1u << 0)
+
+struct xen_domctl_viommu_op {
+uint32_t cmd;
+#define XEN_DOMCTL_viommu_create  0
+union {
+struct {
+/* IN - vIOMMU type  */
+uint8_t type;
+/* IN - MMIO base address of vIOMMU. */
+uint64_t base_address;
+/* IN - Capabilities with which we want to create */
+uint64_t capabilities;
+/* OUT - vIOMMU identity */
+uint32_t id;
+} create;
+} u;
+};
+
+- XEN_DOMCTL_create_viommu
+Create vIOMMU device with type, capabilities and MMIO base address.
+The hypervisor allocates a viommu_id for the new vIOMMU instance and returns it.
+The vIOMMU device model in hypervisor should check whether it can
+support the input capabilitie

Re: [Xen-devel] [PATCH V3 28/29] x86/vvtd: Add queued invalidation (QI) support

2017-10-23 Thread Chao Gao
On Mon, Oct 23, 2017 at 09:57:16AM +0100, Roger Pau Monné wrote:
>On Mon, Oct 23, 2017 at 03:50:24PM +0800, Chao Gao wrote:
>> On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
>> >On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
>> >> From: Chao Gao <chao@intel.com>
>> >> +}
>> >> +
>> >> +unmap_guest_page((void*)qinval_page);
>> >> +return ret;
>> >> +
>> >> + error:
>> >> +unmap_guest_page((void*)qinval_page);
>> >> +gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
>> >> +domain_crash(vvtd->domain);
>> >
>> >Do you really need to crash the domain in such case?
>> 
>> We reach here when guest requests some operations vvtd doesn't claim
>> supported or emulated. I am afraid it also can be triggered by guest.
>> How about ignoring the invalidation request?
>
>What would real hardware do in such case?

After reading the spec again, I think hardware may generate a fault
event; see VT-d spec 10.4.9 Fault Status Register:
Hardware detected an error associated with the invalidation queue. This
could be due to either a hardware error while fetching a descriptor from
the invalidation queue, or hardware detecting an erroneous or invalid
descriptor in the invalidation queue. At this time, a fault event may be
generated based on the programming of the Fault Event Control register

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH V3 28/29] x86/vvtd: Add queued invalidation (QI) support

2017-10-23 Thread Chao Gao
On Fri, Oct 20, 2017 at 12:20:06PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:02:09PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Queued Invalidation Interface is an expanded invalidation interface with
>> extended capabilities. Hardware implementations report support for queued
>> invalidation interface through the Extended Capability Register. The queued
>> invalidation interface uses an Invalidation Queue (IQ), which is a circular
>> buffer in system memory. Software submits commands by writing Invalidation
>> Descriptors to the IQ.
>> 
>> In this patch, a new function viommu_process_iq() is used for emulating how
>> hardware handles invalidation requests through QI.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>> +static int process_iqe(struct vvtd *vvtd, int i)
>
>unsigned int.
>
>> +{
>> +uint64_t iqa;
>> +struct qinval_entry *qinval_page;
>> +int ret = 0;
>> +
>> +iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +qinval_page = map_guest_page(vvtd->domain, 
>> DMA_IQA_ADDR(iqa)>>PAGE_SHIFT);
>
>PFN_DOWN instead of open coding the shift. Both can be initialized
>at declaration. Also AFAICT iqa is only used once, so the local
>variable is not needed.
>
>> +if ( IS_ERR(qinval_page) )
>> +{
>> +gdprintk(XENLOG_ERR, "Can't map guest IRT (rc %ld)",
>> + PTR_ERR(qinval_page));
>> +return PTR_ERR(qinval_page);
>> +}
>> +
>> +switch ( qinval_page[i].q.inv_wait_dsc.lo.type )
>> +{
>> +case TYPE_INVAL_WAIT:
>> +if ( qinval_page[i].q.inv_wait_dsc.lo.sw )
>> +{
>> +uint32_t data = qinval_page[i].q.inv_wait_dsc.lo.sdata;
>> +uint64_t addr = (qinval_page[i].q.inv_wait_dsc.hi.saddr << 2);
>
>Unneeded parentheses.
>
>> +
>> +ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), 
>> current);
>> +if ( ret )
>> +vvtd_info("Failed to write status address");
>
>Don't you need to return or do something here? (like raise some kind
>of error?)

The 'addr' is programmed by the guest. Here vvtd cannot finish this write
for some reason (i.e. the 'addr' may not be in the guest physical memory space).
According to VT-d spec 6.5.2.8 Invalidation Wait Descriptor, "Hardware
behavior is undefined if the Status Address specified is not an address
route-able to memory (such as peer address, interrupt address range of
0xFEEX_ etc.)". I think that Xen can just ignore it. I should use
vvtd_debug() since it is guest triggerable.

>> +if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, 
>> DMA_IECTL_IM_SHIFT) )
>> +{
>> +ie_data = vvtd_get_reg(vvtd, DMAR_IEDATA_REG);
>> +ie_addr = vvtd_get_reg(vvtd, DMAR_IEADDR_REG);
>> +vvtd_generate_interrupt(vvtd, ie_addr, ie_data);
>
>...you don't seem two need the two local variables. They are used only
>once.
>
>> +vvtd_clear_bit(vvtd, DMAR_IECTL_REG, 
>> DMA_IECTL_IP_SHIFT);
>> +}
>> +}
>> +}
>> +break;
>> +
>> +case TYPE_INVAL_IEC:
>> +/*
>> + * Currently, no cache is preserved in hypervisor. Only need to 
>> update
>> + * pIRTEs which are modified in binding process.
>> + */
>> +break;
>> +
>> +default:
>> +goto error;
>
>There's no reason to use a label that's only used for the default
>case. Simply place the code in the error label here.
>
>> +}
>> +
>> +unmap_guest_page((void*)qinval_page);
>> +return ret;
>> +
>> + error:
>> +unmap_guest_page((void*)qinval_page);
>> +gdprintk(XENLOG_ERR, "Internal error in Queue Invalidation.\n");
>> +domain_crash(vvtd->domain);
>
>Do you really need to crash the domain in such case?

We reach here when the guest requests some operation vvtd doesn't claim
to support or emulate. I am afraid it can also be triggered by the guest.
How about ignoring the invalidation request?

I will change the error message, since it isn't an internal error.

Thanks
Chao



Re: [Xen-devel] [PATCH V3 15/29] x86/vvtd: Process interrupt remapping request

2017-10-23 Thread Chao Gao
On Fri, Oct 20, 2017 at 11:01:03AM +0100, Roger Pau Monné wrote:
>On Fri, Oct 20, 2017 at 01:16:37PM +0800, Chao Gao wrote:
>> On Thu, Oct 19, 2017 at 03:26:30PM +0100, Roger Pau Monné wrote:
>> >On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
>> >> +static void unmap_guest_page(void *virt)
>> >> +{
>> >> +struct page_info *page;
>> >> +
>> >> +ASSERT((unsigned long)virt & PAGE_MASK);
>> >
>> >I'm not sure I get the point of the check above.
>> 
I intended to check that the address is 4K-page aligned. It should be
>> 
>> ASSERT(!((unsigned long)virt & (PAGE_SIZE - 1)))
>
>Please use the IS_ALIGNED macro.

Ok.
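
i.e. the check would simply become:

    ASSERT(IS_ALIGNED((unsigned long)virt, PAGE_SIZE));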

>
>> >
>> >> +}
>> >> +return;
>> >> +}
>> >> +
>> >> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
>> >> +  struct 
>> >> arch_irq_remapping_request *irq)
>> >> +{
>> >> +if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
>> >> +{
>> >> +struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
>> >> +
>> >> +ASSERT(rte.format);
>> >
>> >Is it fine to ASSERT here? Can't the guest set rte.format to whatever
>> >it wants?
>> 
>> Guest can use legacy format interrupt (i.e. rte.format = 0). However,
>> we only reach here when callback 'check_irq_remapping' return true and
>> for vvtd, 'check_irq_remapping' just returns the format bit of irq request.
>> If here ret.format isn't true, there must be a bug in our code.
>
>Are you sure the correct locks are hold here to prevent the guest
>from changing rte while all this processing is happening?

The rte here isn't the register in the IOAPIC. It is only (part of) the
interrupt request, an abstraction of the IOAPIC RTE or MSI message. Every
time an interrupt is to be delivered, the interrupt request is composed on
the stack according to the IOAPIC RTE or MSI message. Then we determine
the format of the interrupt, i.e. remapping format or not. Only for the
remapping format is this function called. A non-remapping-format interrupt
is delivered by the vIOAPIC directly and needn't come here to be translated
by the vIOMMU.
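
Roughly like this (a sketch based on the helpers in this series and the
existing vioapic code):

    struct arch_irq_remapping_request request;

    /* Compose the request on the stack from the IOAPIC RTE ... */
    irq_request_ioapic_fill(&request, vioapic->id,
                            vioapic->redirtbl[pin].bits);

    /* ... and only remapping-format requests go through the vIOMMU. */
    if ( viommu_check_irq_remapping(d, &request) )
        viommu_handle_irq_request(d, &request);
    else
        vioapic_deliver(vioapic, pin);  /* legacy format, delivered directly */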

Thanks
Chao



[Xen-devel] [PATCH v3 for 4.10] x86/vpt: guarantee the return value of pt_update_irq() set in vIRR or PIR

2017-10-20 Thread Chao Gao
pt_update_irq() is expected to return the vector number of periodic
timer interrupt, which should be set in vIRR of vlapic or in PIR.
Otherwise it would trigger the assertion in vmx_intr_assist(); please see
https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html.

But it fails to achieve that in the following two cases:
1. hvm_isa_irq_assert() may not set the corresponding bit in vIRR because
the mask field of the IOAPIC RTE is set. Please refer to the call tree
vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge(). The patch
checks whether the vector is set or not in vIRR of vlapic or PIR before
returning.

2. someone changes the vector field of the IOAPIC RTE between asserting
the irq and getting the vector of the irq, leading to setting the
old vector number but returning a different vector number. This patch
allows hvm_isa_irq_assert() to accept a callback which can get the
interrupt vector with the irq_lock held. Thus, no one can change the vector
between the two operations.

BTW, the first argument of pi_test_and_set_pir() should be uint8_t
and I take this chance to fix it.

Signed-off-by: Chao Gao <chao@intel.com>
---
To Julien:
This patch fixes a possible cause of an assertion failure related to the
periodic timer interrupt. OSSTEST occasionally reports a regression when
the bug happens. I intend to merge this patch in 4.10 and then observe
whether the bug disappears or not.

---
Passed the two simple xtf tests in
https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html,
which are designed to produce the above two cases.

v3:
- change the first argument of pi_test_pir() to uint8_t
- change the first argument of pi_test_and_set_pir() to uint8_t
- return -1 when no callback is passed to hvm_isa_irq_assert()
- check hvm_isa_irq_assert(.., vioapic_get_vector) in case the callback failed

v2:
- add a callback to hvm_isa_irq_assert() to avoid code duplication
- Constify vlapic argument of vlapic_test_irq()

---
 xen/arch/x86/hvm/dm.c |  2 +-
 xen/arch/x86/hvm/irq.c| 11 +--
 xen/arch/x86/hvm/pmtimer.c|  2 +-
 xen/arch/x86/hvm/rtc.c|  2 +-
 xen/arch/x86/hvm/vlapic.c | 12 
 xen/arch/x86/hvm/vmx/vmx.c|  7 +++
 xen/arch/x86/hvm/vpt.c| 39 ++-
 xen/include/asm-x86/hvm/hvm.h |  1 +
 xen/include/asm-x86/hvm/irq.h | 12 ++--
 xen/include/asm-x86/hvm/vlapic.h  |  1 +
 xen/include/asm-x86/hvm/vmx/vmx.h |  7 ++-
 11 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c
index 32ade95..a787f43 100644
--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -143,7 +143,7 @@ static int set_isa_irq_level(struct domain *d, uint8_t 
isa_irq,
 hvm_isa_irq_deassert(d, isa_irq);
 break;
 case 1:
-hvm_isa_irq_assert(d, isa_irq);
+hvm_isa_irq_assert(d, isa_irq, NULL);
 break;
 default:
 return -EINVAL;
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index e425df9..0077f68 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -168,11 +168,13 @@ void hvm_gsi_deassert(struct domain *d, unsigned int gsi)
 spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
-void hvm_isa_irq_assert(
-struct domain *d, unsigned int isa_irq)
+int hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq,
+   int (*get_vector)(const struct domain *d,
+ unsigned int gsi))
 {
 struct hvm_irq *hvm_irq = hvm_domain_irq(d);
 unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
+int vector = -1;
 
 ASSERT(isa_irq <= 15);
 
@@ -182,7 +184,12 @@ void hvm_isa_irq_assert(
  (hvm_irq->gsi_assert_count[gsi]++ == 0) )
 assert_irq(d, gsi, isa_irq);
 
+if ( get_vector )
+vector = get_vector(d, gsi);
+
 spin_unlock(&d->arch.hvm_domain.irq_lock);
+
+return vector;
 }
 
 void hvm_isa_irq_deassert(
diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c
index b70c299..435647f 100644
--- a/xen/arch/x86/hvm/pmtimer.c
+++ b/xen/arch/x86/hvm/pmtimer.c
@@ -61,7 +61,7 @@ static void pmt_update_sci(PMTState *s)
 ASSERT(spin_is_locked(&s->lock));
 
 if ( acpi->pm1a_en & acpi->pm1a_sts & SCI_MASK )
-hvm_isa_irq_assert(s->vcpu->domain, SCI_IRQ);
+hvm_isa_irq_assert(s->vcpu->domain, SCI_IRQ, NULL);
 else
 hvm_isa_irq_deassert(s->vcpu->domain, SCI_IRQ);
 }
diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c
index bcfa169..cb75b99 100644
--- a/xen/arch/x86/hvm/rtc.c
+++ b/xen/arch/x86/hvm/rtc.c
@@ -75,7 +75,7 @@ static void rtc_update_irq(RTCState *s)
 s->hw.cmos_data[RTC_REG_C] |= RTC_IRQF;
 if ( rtc_mode_is(s, no_ack) )
 hvm_isa_irq_deassert(vrtc_domain(s), RTC_IRQ);

Re: [Xen-devel] [PATCH V3 11/29] x86/hvm: Introduce a emulated VTD for HVM

2017-10-20 Thread Chao Gao
On Fri, Oct 20, 2017 at 12:56:03AM -0600, Jan Beulich wrote:
>>>> On 20.10.17 at 04:46, <chao@intel.com> wrote:
>> On Thu, Oct 19, 2017 at 12:20:35PM +0100, Roger Pau Monné wrote:
>>>On Thu, Sep 21, 2017 at 11:01:52PM -0400, Lan Tianyu wrote:
>>>> From: Chao Gao <chao@intel.com>
>>>> 
>>>> This patch adds create/destroy function for the emulated VTD
>>>> and adapts it to the common VIOMMU abstraction.
>>>> 
>>>> Signed-off-by: Chao Gao <chao@intel.com>
>>>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>>>> ---
>>>>  
>>>> -obj-y += iommu.o
>>>>  obj-y += dmar.o
>>>> -obj-y += utils.o
>>>> -obj-y += qinval.o
>>>>  obj-y += intremap.o
>>>> +obj-y += iommu.o
>>>> +obj-y += qinval.o
>>>>  obj-y += quirks.o
>>>> +obj-y += utils.o
>>>
>>>Why do you need to shuffle the list above?
>> 
I placed them in alphabetical order.
>
>Which is appreciated. But this being non-essential for the patch, it
>would avoid (valid) reviewer questions if you said in the description
>this is an intended but non-essential change.

Sure. I will keep this in mind.

>
>>>Also I'm not sure the Intel vIOMMU implementation should live here. As
>>>you can see the path is:
>>>
>>>xen/drivers/passthrough/vtd/
>>>
>>>The vIOMMU is not tied to passthrough at all, so I would rather place
>>>it in:
>
>Hmm, is vIOMMU usable without an actual backing IOMMU?

I think so. Currently, all vIOMMU features are emulated.

Thanks
Chao



Re: [Xen-devel] [PATCH V3 26/29] x86/vvtd: Handle interrupt translation faults

2017-10-20 Thread Chao Gao
On Thu, Oct 19, 2017 at 05:31:37PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:02:07PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Interrupt translation faults are non-recoverable fault. When faults
>> are triggered, it needs to populate fault info to Fault Recording
>> Registers and inject vIOMMU msi interrupt to notify guest IOMMU driver
>> to deal with faults.
>> 
>> This patch emulates hardware's handling interrupt translation
>> faults (more information about the process can be found in VT-d spec,
>> chipter "Translation Faults", section "Non-Recoverable Fault
>> Reporting" and section "Non-Recoverable Logging").
>> Specifically, viommu_record_fault() records the fault information and
>> viommu_report_non_recoverable_fault() reports faults to software.
>> Currently, only Primary Fault Logging is supported and the Number of
>> Fault-recording Registers is 1.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>>  xen/drivers/passthrough/vtd/iommu.h |  60 +++--
>>  xen/drivers/passthrough/vtd/vvtd.c  | 252 
>> +++-
>>  2 files changed, 301 insertions(+), 11 deletions(-)
>> 
>> diff --git a/xen/drivers/passthrough/vtd/iommu.h 
>> b/xen/drivers/passthrough/vtd/iommu.h
>> index 790384f..e19b045 100644
>> --- a/xen/drivers/passthrough/vtd/iommu.h
>> +++ b/xen/drivers/passthrough/vtd/iommu.h
>> @@ -198,26 +198,66 @@
>>  #define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59))
>>  
>>  /* FECTL_REG */
>> -#define DMA_FECTL_IM (((u64)1) << 31)
>> +#define DMA_FECTL_IM_SHIFT 31
>> +#define DMA_FECTL_IM (1U << DMA_FECTL_IM_SHIFT)
>> +#define DMA_FECTL_IP_SHIFT 30
>> +#define DMA_FECTL_IP (1U << DMA_FECTL_IP_SHIFT)
>
>Is it fine to change those from uint64_t to unsigned int?

Yes. The FECTL and FSTS are 32-bit registers.

>
>>  
>>  /* FSTS_REG */
>> -#define DMA_FSTS_PFO ((u64)1 << 0)
>> -#define DMA_FSTS_PPF ((u64)1 << 1)
>> -#define DMA_FSTS_AFO ((u64)1 << 2)
>> -#define DMA_FSTS_APF ((u64)1 << 3)
>> -#define DMA_FSTS_IQE ((u64)1 << 4)
>> -#define DMA_FSTS_ICE ((u64)1 << 5)
>> -#define DMA_FSTS_ITE ((u64)1 << 6)
>> -#define DMA_FSTS_FAULTS DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | 
>> DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE
>> +#define DMA_FSTS_PFO_SHIFT 0
>> +#define DMA_FSTS_PFO (1U << DMA_FSTS_PFO_SHIFT)
>> +#define DMA_FSTS_PPF_SHIFT 1
>> +#define DMA_FSTS_PPF (1U << DMA_FSTS_PPF_SHIFT)
>> +#define DMA_FSTS_AFO (1U << 2)
>> +#define DMA_FSTS_APF (1U << 3)
>> +#define DMA_FSTS_IQE (1U << 4)
>> +#define DMA_FSTS_ICE (1U << 5)
>> +#define DMA_FSTS_ITE (1U << 6)
>
>This seemingly non-functional changes should be done in a separate
>patch.

sure.

>> +static int vvtd_alloc_frcd(struct vvtd *vvtd)
>> +{
>> +int prev;
>> +uint64_t cap = vvtd_get_reg(vvtd, DMAR_CAP_REG);
>> +unsigned int base = cap_fault_reg_offset(cap);
>> +
>> +/* Set the F bit to indicate the FRCD is in use. */
>> +if ( !vvtd_test_and_set_bit(vvtd,
>> +base + vvtd->status.fault_index * 
>> DMA_FRCD_LEN +
>> +DMA_FRCD3_OFFSET, DMA_FRCD_F_SHIFT) )
>> +{
>> +prev = vvtd->status.fault_index;
>> +vvtd->status.fault_index = (prev + 1) % cap_num_fault_regs(cap);
>> +return vvtd->status.fault_index;
>
>I would prefer that you return the index as an unsigned int parameter
>passed by reference rather than as the return value of the function,
>but that might not be the preference of others.

What are the pros and cons?
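
For comparison, the two calling conventions would look roughly like this
(hypothetical sketch; the error handling is the main difference):

    /* Return-value style (this patch): index and error share one value. */
    fault_index = vvtd_alloc_frcd(vvtd);
    if ( fault_index < 0 )
        /* fault overflow */;

    /* Out-parameter style (Roger's suggestion), with a hypothetical
     * signature int vvtd_alloc_frcd(struct vvtd *vvtd, unsigned int *idx): */
    unsigned int frcd_idx;

    if ( vvtd_alloc_frcd(vvtd, &frcd_idx) )
        /* fault overflow */;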

>> +static int vvtd_record_fault(struct vvtd *vvtd,
>> + struct arch_irq_remapping_request *request,
>> + int reason)
>> +{
>> +struct vtd_fault_record_register frcd;
>> +int fault_index;
>> +
>> +switch(reason)
>> +{
>> +case VTD_FR_IR_REQ_RSVD:
>> +case VTD_FR_IR_INDEX_OVER:
>> +case VTD_FR_IR_ENTRY_P:
>> +case VTD_FR_IR_ROOT_INVAL:
>> +case VTD_FR_IR_IRTE_RSVD:
>> +case VTD_FR_IR_REQ_COMPAT:
>> +case VTD_FR_IR_SID_ERR:
>> +if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_PFO_SHIFT) )
>> +return X86EMUL_OKAY;
>> +
>> +/* No available Fault Record means Fault overflowed */
>> +fault_index = vvtd_alloc_frcd(vvtd);
>> +if ( fault_index == -1 )
>
>Erm, wouldn't vvtd_alloc_frcd return -ENOMEM in case of error? Ie: you
>should check if ( fault_index < 0 ).

It is a mistake.

Thanks
Chao




Re: [Xen-devel] [PATCH V3 24/29] tools/libxc: Add a new interface to bind remapping format msi with pirq

2017-10-20 Thread Chao Gao
On Thu, Oct 19, 2017 at 05:03:26PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:02:05PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>
>The title for this patch it's wrong, it modifies both the hypervisor
>and libxc. Please fix it.
>
>> When exposing vIOMMU (vvtd) to guest, guest can configure the msi to
>> remapping format. For pass-through device, the physical interrupt now
>> can be bound with remapping format msi. This patch introduce a flag,
>> HVM_IRQ_DPCI_GUEST_REMAPPED, which indicate a physical interrupt is
>> bound with remapping format guest interrupt. Thus, we can use
>> (HVM_IRQ_DPCI_GUEST_REMAPPED | HVM_IRQ_DPCI_GUEST_MSI) to show the new
>> binding type. Also provide an new interface to manage the new binding.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> 
>> ---
>> diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
>> index bd8a918..4f5d37b 100644
>> --- a/xen/include/asm-x86/hvm/irq.h
>> +++ b/xen/include/asm-x86/hvm/irq.h
>> @@ -121,6 +121,7 @@ struct dev_intx_gsi_link {
>>  #define _HVM_IRQ_DPCI_GUEST_PCI_SHIFT   4
>>  #define _HVM_IRQ_DPCI_GUEST_MSI_SHIFT   5
>>  #define _HVM_IRQ_DPCI_IDENTITY_GSI_SHIFT6
>> +#define _HVM_IRQ_DPCI_GUEST_REMAPPED_SHIFT  7
>>  #define _HVM_IRQ_DPCI_TRANSLATE_SHIFT  15
>>  #define HVM_IRQ_DPCI_MACH_PCI(1u << _HVM_IRQ_DPCI_MACH_PCI_SHIFT)
>>  #define HVM_IRQ_DPCI_MACH_MSI(1u << _HVM_IRQ_DPCI_MACH_MSI_SHIFT)
>> @@ -128,6 +129,7 @@ struct dev_intx_gsi_link {
>>  #define HVM_IRQ_DPCI_EOI_LATCH   (1u << _HVM_IRQ_DPCI_EOI_LATCH_SHIFT)
>>  #define HVM_IRQ_DPCI_GUEST_PCI   (1u << _HVM_IRQ_DPCI_GUEST_PCI_SHIFT)
>>  #define HVM_IRQ_DPCI_GUEST_MSI   (1u << _HVM_IRQ_DPCI_GUEST_MSI_SHIFT)
>> +#define HVM_IRQ_DPCI_GUEST_REMAPPED  (1u << 
>> _HVM_IRQ_DPCI_GUEST_REMAPPED_SHIFT)
>>  #define HVM_IRQ_DPCI_IDENTITY_GSI(1u << 
>> _HVM_IRQ_DPCI_IDENTITY_GSI_SHIFT)
>>  #define HVM_IRQ_DPCI_TRANSLATE   (1u << _HVM_IRQ_DPCI_TRANSLATE_SHIFT)
>
>Please keep this sorted. It should go after the _GSI one.
>
>>  
>> @@ -137,6 +139,11 @@ struct hvm_gmsi_info {
>>  uint32_t gvec;
>>  uint32_t gflags;
>>  } legacy;
>> +struct {
>> +uint32_t source_id;
>> +uint32_t data;
>> +uint64_t addr;
>> +} intremap;
>>  };
>>  int dest_vcpu_id; /* -1 :multi-dest, non-negative: dest_vcpu_id */
>>  bool posted; /* directly deliver to guest via VT-d PI? */
>> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
>> index 68854b6..8c59cfc 100644
>> --- a/xen/include/public/domctl.h
>> +++ b/xen/include/public/domctl.h
>> @@ -559,6 +559,7 @@ typedef enum pt_irq_type_e {
>>  PT_IRQ_TYPE_MSI,
>>  PT_IRQ_TYPE_MSI_TRANSLATE,
>>  PT_IRQ_TYPE_SPI,/* ARM: valid range 32-1019 */
>> +PT_IRQ_TYPE_MSI_IR,
>
>Introducing a new irq type seems dubious, at the end this is still a
>MSI interrupt.
>
>>  } pt_irq_type_t;
>>  struct xen_domctl_bind_pt_irq {
>>  uint32_t machine_irq;
>> @@ -586,6 +587,12 @@ struct xen_domctl_bind_pt_irq {
>>  uint64_aligned_t gtable;
>>  } msi;
>>  struct {
>> +uint32_t source_id;
>> +uint32_t data;
>> +uint64_t addr;
>> +uint64_t gtable;
>> +} msi_ir;
>
>Have you tried to expand gflags somehow so that you don't need a new
>type together with a new structure?

gflags doesn't have enough bits to contain so much information.

>
>It seems quite cumbersome and also involves adding more handlers to
>libxc.
>
>At the end this is a domctl interface, so you should be able to modify
>it at will.

Considering gtable and gflags are also needed for 'msi_ir',
modifying the existing interface seems better than adding a new one.
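
For instance, the existing msi sub-structure could be widened rather than
adding PT_IRQ_TYPE_MSI_IR (a purely hypothetical layout; the new field names
are made up for illustration):

    struct {
        uint8_t gvec;
        uint32_t gflags;
        uint64_aligned_t gtable;
        /* New, only valid when a "remapped" flag is set in gflags: */
        uint32_t source_id;
        uint32_t data;
        uint64_aligned_t addr;
    } msi;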

Thanks
Chao



Re: [Xen-devel] [PATCH V3 16/29] x86/vvtd: decode interrupt attribute from IRTE

2017-10-20 Thread Chao Gao
On Thu, Oct 19, 2017 at 03:39:44PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:57PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Without interrupt remapping, interrupt attributes can be extracted from
>> msi message or IOAPIC RTE. However, with interrupt remapping enabled,
>> the attributes are enclosed in the associated IRTE. This callback is
>> for cases in which the caller wants to acquire interrupt attributes, for
>> example:
>> 1. vioapic_get_vector(). With vIOMMU, the RTE may don't contain vector.
>^ not
>> 2. perform EOI which is always based on the interrupt vector.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>> v3:
>>  - add example cases in which we will use this function.
>> ---
>>  xen/drivers/passthrough/vtd/vvtd.c | 23 ++-
>>  1 file changed, 22 insertions(+), 1 deletion(-)
>> 
>> diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
>> b/xen/drivers/passthrough/vtd/vvtd.c
>> index 90c00f5..5e22ace 100644
>> --- a/xen/drivers/passthrough/vtd/vvtd.c
>> +++ b/xen/drivers/passthrough/vtd/vvtd.c
>> @@ -516,6 +516,26 @@ static int vvtd_handle_irq_request(struct domain *d,
>>   irte.remap.tm);
>>  }
>>  
>> +static int vvtd_get_irq_info(struct domain *d,
>> + struct arch_irq_remapping_request *irq,
>> + struct arch_irq_remapping_info *info)
>> +{
>> +int ret;
>> +struct iremap_entry irte;
>> +struct vvtd *vvtd = domain_vvtd(d);
>
>I've realized that some of the helpers perform a if (!vvtd ) return
>check, while others don't (like this one). Are some handlers expected
>to be called without a vIOMMU?

No. I forgot to check the existence of a vIOMMU here.

Thanks
chao




Re: [Xen-devel] [PATCH V3 15/29] x86/vvtd: Process interrupt remapping request

2017-10-20 Thread Chao Gao
On Thu, Oct 19, 2017 at 03:26:30PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:56PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> When a remapping interrupt request arrives, remapping hardware computes the
>> interrupt_index per the algorithm described in VTD spec
>> "Interrupt Remapping Table", interprets the IRTE and generates a remapped
>> interrupt request.
>> 
>> This patch introduces viommu_handle_irq_request() to emulate the process how
>> remapping hardware handles a remapping interrupt request.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> 
>> ---
>>  
>> +enum VTD_FAULT_TYPE
>> +{
>> +/* Interrupt remapping transition faults */
>> +VTD_FR_IR_REQ_RSVD  = 0x20, /* One or more IR request reserved
>> + * fields set */
>> +VTD_FR_IR_INDEX_OVER= 0x21, /* Index value greater than max */
>> +VTD_FR_IR_ENTRY_P   = 0x22, /* Present (P) not set in IRTE */
>> +VTD_FR_IR_ROOT_INVAL= 0x23, /* IR Root table invalid */
>> +VTD_FR_IR_IRTE_RSVD = 0x24, /* IRTE Rsvd field non-zero with
>> + * Present flag set */
>> +VTD_FR_IR_REQ_COMPAT= 0x25, /* Encountered compatible IR
>> + * request while disabled */
>> +VTD_FR_IR_SID_ERR   = 0x26, /* Invalid Source-ID */
>> +};
>
>Why does this need to be an enum? Plus enum type names should not be
>all in uppercase.
>
>In any case, I would just use defines, like it's done for all other
>values in the file.

Sure. Will follow your suggestion.

>> +static void unmap_guest_page(void *virt)
>> +{
>> +struct page_info *page;
>> +
>> +ASSERT((unsigned long)virt & PAGE_MASK);
>
>I'm not sure I get the point of the check above.

I intended to check that the address is 4K-page aligned. It should be

ASSERT(!((unsigned long)virt & (PAGE_SIZE - 1)))

>> +}
>> +
>> +static inline uint32_t irte_dest(struct vvtd *vvtd, uint32_t dest)
>> +{
>> +/* In xAPIC mode, only 8-bits([15:8]) are valid */
>> +return vvtd->status.eim_enabled ? dest
>   : MASK_EXTR(dest, IRTE_xAPIC_DEST_MASK);
>
>It's easier to read style wise.

sure.

>
>> +}
>> +
>>  static void vvtd_handle_gcmd_ire(struct vvtd *vvtd, uint32_t val)
>>  {
>>  vvtd_info("%sable Interrupt Remapping",
>> @@ -255,6 +387,135 @@ static const struct hvm_mmio_ops vvtd_mmio_ops = {
>>  .write = vvtd_write
>>  };
>>  
>> +static void vvtd_handle_fault(struct vvtd *vvtd,
>> +  struct arch_irq_remapping_request *irq,
>> +  struct iremap_entry *irte,
>> +  unsigned int fault,
>> +  bool record_fault)
>> +{
>> +   if ( !record_fault )
>> +return;
>> +
>> +switch ( fault )
>> +{
>> +case VTD_FR_IR_SID_ERR:
>> +case VTD_FR_IR_IRTE_RSVD:
>> +case VTD_FR_IR_ENTRY_P:
>> +if ( qinval_fault_disable(*irte) )
>> +break;
>> +/* fall through */
>> +case VTD_FR_IR_INDEX_OVER:
>> +case VTD_FR_IR_ROOT_INVAL:
>> +/* TODO: handle fault (e.g. record and report this fault to VM */
>> +break;
>> +
>> +default:
>> +gdprintk(XENLOG_INFO, "Can't handle VT-d fault %x\n", fault);
>
>You already defined some vvtd specific debug helpers, why are those
>not used here? gdprintk (as the 'd' denotes) is only for debug
>purposes.

The default case means we encountered a bug in our code. I want to output
this kind of message even in non-debug builds, so I should use gprintk.

>
>> +}
>> +return;
>> +}
>> +
>> +static bool vvtd_irq_request_sanity_check(const struct vvtd *vvtd,
>> +  struct arch_irq_remapping_request 
>> *irq)
>> +{
>> +if ( irq->type == VIOMMU_REQUEST_IRQ_APIC )
>> +{
>> +struct IO_APIC_route_remap_entry rte = { .val = irq->msg.rte };
>> +
>> +ASSERT(rte.format);
>
>Is it fine to ASSERT here? Can't the guest set rte.format to whatever
>it wants?

A guest can use legacy format interrupts (i.e. rte.format = 0). However,
we only reach here when the 'check_irq_remapping' callback returns true, and
for vvtd 'check_irq_remapping' just returns the format bit of the irq
request. If rte.format isn't set here, there must be a bug in our code.

Re: [Xen-devel] [PATCH V3 13/29] x86/vvtd: Set Interrupt Remapping Table Pointer through GCMD

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 12:56:45PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:54PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Software sets this field to set/update the interrupt remapping table pointer
>> used by hardware. The interrupt remapping table pointer is specified through
>> the Interrupt Remapping Table Address (IRTA_REG) register.
>> 
>> This patch emulates this operation and adds some new fields in VVTD to track
>> info (e.g. the table's gfn and max supported entries) of interrupt remapping
>> table.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> 
>> ---
>> @@ -148,6 +205,18 @@ static int vvtd_write(struct vcpu *v, unsigned long 
>> addr,
>>  break;
>>  }
>>  }
>> +else /* len == 8 */
>> +{
>> +switch ( offset )
>> +{
>> +case DMAR_IRTA_REG:
>> +vvtd_set_reg_quad(vvtd, DMAR_IRTA_REG, val);
>
>I have kind of a generic comment regarding the handlers in general,
>which I will just make here. Don't you need some kind of locking to
>prevent concurrent read/write accesses to the registers?

I think the guest should be responsible for avoiding concurrent accesses.
Xen only needs to make sure it cannot be fooled (crashed) by a malicious guest.

>
>Also the 'if' to handle different sized accesses to the same registers
>seems quite cumbersome. I would think there's a better way to handle
>this with a single switch statement.

Will use only one switch statement and maybe add an if/else for the
registers which can be accessed with different sizes.
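
Something along these lines (a rough sketch of the write side, reusing the
helpers from this patch):

    static int vvtd_write(struct vcpu *v, unsigned long addr,
                          unsigned int len, unsigned long val)
    {
        struct vvtd *vvtd = domain_vvtd(v->domain);
        unsigned int offset = addr - vvtd->base_addr;

        switch ( offset )
        {
        case DMAR_IRTA_REG:
            /* One of the registers accepting both 4- and 8-byte writes. */
            if ( len == 8 )
                vvtd_set_reg_quad(vvtd, DMAR_IRTA_REG, val);
            else
                vvtd_set_reg(vvtd, DMAR_IRTA_REG, val);
            break;

        /* ... other registers handled similarly ... */
        }

        return X86EMUL_OKAY;
    }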

Thanks
chao



Re: [Xen-devel] [PATCH V3 12/29] x86/vvtd: Add MMIO handler for VVTD

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 12:34:54PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:53PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> This patch adds VVTD MMIO handler to deal with MMIO access.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>>  xen/drivers/passthrough/vtd/vvtd.c | 91 
>> ++
>>  1 file changed, 91 insertions(+)
>> 
>> diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
>> b/xen/drivers/passthrough/vtd/vvtd.c
>> index c851ec7..a3002c3 100644
>> --- a/xen/drivers/passthrough/vtd/vvtd.c
>> +++ b/xen/drivers/passthrough/vtd/vvtd.c
>> @@ -47,6 +47,29 @@ struct vvtd {
>>  struct page_info *regs_page;
>>  };
>>  
>> +/* Setting viommu_verbose enables debugging messages of vIOMMU */
>> +bool __read_mostly viommu_verbose;
>> +boolean_runtime_param("viommu_verbose", viommu_verbose);
>> +
>> +#ifndef NDEBUG
>> +#define vvtd_info(fmt...) do {\
>> +if ( viommu_verbose ) \
>> +gprintk(XENLOG_G_INFO, ## fmt);   \
>
>If you use gprintk you should use XENLOG_INFO, the '_G_' variants are
>only used with plain printk.
>
>> +} while(0)
>> +#define vvtd_debug(fmt...) do {   \
>> +if ( viommu_verbose && printk_ratelimit() )   \
>
>Not sure why you need printk_ratelimit, XENLOG_G_DEBUG is already
>rate-limited.
>
>> +printk(XENLOG_G_DEBUG fmt);   \
>
>Any reason why vvtd_info uses gprintk and here you use printk?
>
>> +} while(0)
>> +#else
>> +#define vvtd_info(fmt...) do {} while(0)
>> +#define vvtd_debug(fmt...) do {} while(0)
>
>No need for 'fmt...' just '...' will suffice since you are discarding
>the parameters anyway.
>
>> +#endif
>> +
>> +struct vvtd *domain_vvtd(struct domain *d)
>> +{
>> +return (d->viommu) ? d->viommu->priv : NULL;
>
>Unneeded parentheses around d->viommu.
>
>Also, it seems wring to call domain_vvtd with !d->viommu. So I think
>this helper should just be removed, and d->viommu->priv fetched
>directly.
>
>> +}
>> +
>>  static inline void vvtd_set_reg(struct vvtd *vtd, uint32_t reg, uint32_t 
>> value)
>>  {
>>  vtd->regs->data32[reg/sizeof(uint32_t)] = value;
>> @@ -68,6 +91,73 @@ static inline uint64_t vvtd_get_reg_quad(struct vvtd 
>> *vtd, uint32_t reg)
>>  return vtd->regs->data64[reg/sizeof(uint64_t)];
>>  }
>>  
>> +static int vvtd_in_range(struct vcpu *v, unsigned long addr)
>> +{
>> +struct vvtd *vvtd = domain_vvtd(v->domain);
>> +
>> +if ( vvtd )
>> +return (addr >= vvtd->base_addr) &&
>> +   (addr < vvtd->base_addr + PAGE_SIZE);
>
>So the register set covers a PAGE_SIZE, but hvm_hw_vvtd_regs only
>covers from 0 to 1024B, it seems like there's something wrong here...
>
>> +return 0;
>> +}
>> +
>> +static int vvtd_read(struct vcpu *v, unsigned long addr,
>> + unsigned int len, unsigned long *pval)
>> +{
>> +struct vvtd *vvtd = domain_vvtd(v->domain);
>> +unsigned int offset = addr - vvtd->base_addr;
>> +
>> +vvtd_info("Read offset %x len %d\n", offset, len);
>> +
>> +if ( (len != 4 && len != 8) || (offset & (len - 1)) )
>
>What value does hardware return when performing unaligned reads or
>reads with wrong size?

According to VT-d spec section 10.2, "Software must access 64-bit and
128-bit registers as either aligned quadwords or aligned doublewords".
I am afraid the spec doesn't define a specific hardware action for
unaligned accesses. We can treat it as undefined behavior and do nothing.
But I did see the Windows driver issue such accesses, so we may need to add
a workaround for Windows later.

>
>Here you return with pval not set, which is dangerous.

Indeed. But I need to check whether pval is initialized by the caller.
If it is, this is safe.
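
A more defensive variant might look like this (a sketch only; whether to
return all-ones or fault on bad accesses is a judgement call):

    static int vvtd_read(struct vcpu *v, unsigned long addr,
                         unsigned int len, unsigned long *pval)
    {
        struct vvtd *vvtd = domain_vvtd(v->domain);
        unsigned int offset = addr - vvtd->base_addr;

        /* Reject unaligned, wrongly sized or out-of-range accesses, but
         * never leave *pval uninitialised for the emulator. */
        if ( (len != 4 && len != 8) || (offset & (len - 1)) ||
             offset + len > sizeof(*vvtd->regs) )
        {
            *pval = ~0UL;
            return X86EMUL_OKAY;
        }

        *pval = (len == 4) ? vvtd_get_reg(vvtd, offset)
                           : vvtd_get_reg_quad(vvtd, offset);

        return X86EMUL_OKAY;
    }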

>
>> +return X86EMUL_OKAY;
>> +
>> +if ( len == 4 )
>> +*pval = vvtd_get_reg(vvtd, offset);
>> +else
>> +*pval = vvtd_get_reg_quad(vvtd, offset);
>
>...yet here you don't check for offset < 1024.
>
>> +
>> +return X86EMUL_OKAY;
>> +}
>> +
>> +static int vvtd_write(struct vcpu *v, unsigned long addr,
>> +  unsigned int len, unsigned long val)
>> +{
&

Re: [Xen-devel] [PATCH V3 11/29] x86/hvm: Introduce a emulated VTD for HVM

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 12:20:35PM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:52PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> This patch adds create/destroy function for the emulated VTD
>> and adapts it to the common VIOMMU abstraction.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>>  
>> -obj-y += iommu.o
>>  obj-y += dmar.o
>> -obj-y += utils.o
>> -obj-y += qinval.o
>>  obj-y += intremap.o
>> +obj-y += iommu.o
>> +obj-y += qinval.o
>>  obj-y += quirks.o
>> +obj-y += utils.o
>
>Why do you need to shuffle the list above?

I placed them in alphabetical order.

>
>Also I'm not sure the Intel vIOMMU implementation should live here. As
>you can see the path is:
>
>xen/drivers/passthrough/vtd/
>
>The vIOMMU is not tied to passthrough at all, so I would rather place
>it in:
>
>xen/drivers/vvtd/
>
>Or maybe you can create something like:
>
>xen/drivers/viommu/
>
>So that all vIOMMU implementations can share some code.
>

vvtd and vtd use the same header files (e.g. vtd.h). That is why we put
it there. If we move it, we should move the related header files to a
common directory.

>>  #define cap_isoch(c)(((c) >> 23) & 1)
>>  #define cap_qos(c)(((c) >> 22) & 1)
>>  #define cap_mgaw(c)((((c) >> 16) & 0x3f) + 1)
>> -#define cap_sagaw(c)(((c) >> 8) & 0x1f)
>> +#define cap_set_mgaw(c) ((((c) - 1) & 0x3f) << 16)
>> +#define cap_sagaw(c)(((c) >> DMA_CAP_SAGAW_SHIFT) & 0x1f)
>>  #define cap_caching_mode(c)(((c) >> 7) & 1)
>>  #define cap_phmr(c)(((c) >> 6) & 1)
>>  #define cap_plmr(c)(((c) >> 5) & 1)
>> @@ -104,10 +113,16 @@
>>  #define ecap_niotlb_iunits(e)((((e) >> 24) & 0xff) + 1)
>>  #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
>>  #define ecap_coherent(e) ((e >> 0) & 0x1)
>> -#define ecap_queued_inval(e) ((e >> 1) & 0x1)
>> +#define DMA_ECAP_QI_SHIFT1
>> +#define DMA_ECAP_QI  (1ULL << DMA_ECAP_QI_SHIFT)
>> +#define ecap_queued_inval(e) ((e >> DMA_ECAP_QI_SHIFT) & 0x1)
>
>Looks like this could be based on MASK_EXTR instead, but seeing how
>the file is full of open-coded mask extracts I'm not sure it's worth
>it anymore.
>
>>  #define ecap_dev_iotlb(e)((e >> 2) & 0x1)
>> -#define ecap_intr_remap(e)   ((e >> 3) & 0x1)
>> -#define ecap_eim(e)  ((e >> 4) & 0x1)
>> +#define DMA_ECAP_IR_SHIFT3
>> +#define DMA_ECAP_IR  (1ULL << DMA_ECAP_IR_SHIFT)
>> +#define ecap_intr_remap(e)   ((e >> DMA_ECAP_IR_SHIFT) & 0x1)
>> +#define DMA_ECAP_EIM_SHIFT   4
>> +#define DMA_ECAP_EIM (1ULL << DMA_ECAP_EIM_SHIFT)
>> +#define ecap_eim(e)  ((e >> DMA_ECAP_EIM_SHIFT) & 0x1)
>
>Maybe worth placing all the DMA_ECAP_* defines in a separate section?
>Seems like how it's done for other features like DMA_FSTS or
>DMA_CCMD.

Got it.

>> +
>> +/* Supported capabilities by vvtd */
>> +unsigned int vvtd_caps = VIOMMU_CAP_IRQ_REMAPPING;
>
>static?
>
>Or even better, why is this not a define like VIOMMU_MAX_CAPS or
>similar.

Yeah. It should be renamed to VVTD_MAX_CAPS.

>
>> +
>> +union hvm_hw_vvtd_regs {
>> +uint32_t data32[256];
>> +uint64_t data64[128];
>> +};
>
>Do you really need to store all the register space instead of only
>storing specific registers?

I prefer to store all the registers, so we don't need a trick to map the
real hardware offset to an index in the array.

>
>> +
>> +struct vvtd {
>> +/* Address range of remapping hardware register-set */
>> +uint64_t base_addr;
>> +uint64_t length;
>
>The length field doesn't seem to be used below.

will remove it.

>
>> +/* Point back to the owner domain */
>> +struct domain *domain;
>> +union hvm_hw_vvtd_regs *regs;
>
>Does this need to be a pointer?

Seems not.
>
>> +struct page_info *regs_page;
>> +};
>> +
>> +static int vvtd_create(struct domain *d, struct viommu *viommu)
>> +{
>> +struct vvtd *vvtd;
>> +int ret;
>> +
>> +if ( !is_hvm_domain(d) || (viommu->base_address & (PAGE_SIZE - 1)) ||
>> +(~vvtd_caps & viommu->caps) )
>> +  

Re: [Xen-devel] [PATCH V3 10/29] vtd: add and align register definitions

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 11:21:35AM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:51PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> No functional changes.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>
>Reviewed-by: Roger Pau Monné <roger@citrix.com>

Thanks

>
>Would have been nice to maybe split this into two, one patch that
>simply fixes the alignment and another one that introduces the new
>defines (or even introduce the new defines when they are actually
>needed).

Will divide it into two parts.

Thanks
Chao



Re: [Xen-devel] [PATCH V3 7/29] tools/libxl: build DMAR table for a guest with one virtual VTD

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 11:00:27AM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:48PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> A new logic is added to build ACPI DMAR table in tool stack for a guest
>> with one virtual VTD and pass through it to guest via existing mechanism. If
>> there already are ACPI tables needed to pass through, we joint the tables.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> 
>> ---
>> +/*
>> + * For hvm, we don't need build acpi in libxl. Instead, it's built in 
>> hvmloader.
>> + * But if one hvm has virtual VTD(s), we build DMAR table for it and joint 
>> this
>> + * table with existing content in acpi_modules in order to employ HVM
>> + * firmware pass-through mechanism to pass-through DMAR table.
>> + */
>> +static int libxl__dom_load_acpi_hvm(libxl__gc *gc,
>> +const libxl_domain_build_info *b_info,
>> +struct xc_dom_image *dom)
>> +{
>
>AFAICT there's some code duplication between libxl__dom_load_acpi_hvm
>and libxl__dom_load_acpi_pvh, isn't there a chance you could put this
>in a common function?

Will give it a shot.

>
>> +struct acpi_config config = { 0 };
>> +struct acpi_ctxt ctxt;
>> +void *table;
>> +uint32_t len;
>> +
>> +if ((b_info->type != LIBXL_DOMAIN_TYPE_HVM) ||
>> +(b_info->device_model_version == LIBXL_DEVICE_MODEL_VERSION_NONE) ||
>> +(b_info->num_viommus != 1) ||
>> +(b_info->viommu[0].type != LIBXL_VIOMMU_TYPE_INTEL_VTD))
>> +return 0;
>> +
>> +ctxt.mem_ops.alloc = acpi_memalign;
>> +ctxt.mem_ops.v2p = virt_to_phys;
>> +ctxt.mem_ops.free = acpi_mem_free;
>> +
>> +if (libxl_defbool_val(b_info->viommu[0].intremap))
>> +config.iommu_intremap_supported = true;
>> +/* x2apic is always enabled since in no case we must disable it */
>> +config.iommu_x2apic_supported = true;
>> +config.iommu_base_addr = b_info->viommu[0].base_addr;
>
>I don't see libxl__dom_load_acpi_pvh setting any of the vIOMMU fields.

I didn't try to enable vIOMMU for PVH. I will attempt to add vIOMMU
support for PVH and put those patches at the end of this series.

>
>> +int libxl__dom_load_acpi(libxl__gc *gc,
>> + const libxl_domain_build_info *b_info,
>> + struct xc_dom_image *dom)
>> +{
>> +
>> +if (b_info->type != LIBXL_DOMAIN_TYPE_HVM)
>> +return 0;
>
>Keep in mind a new PVH domain type has been introduced recently in
>libxl, you will have to change this to b_info->type == LIBXL_DOMAIN_TYPE_PV.

Thanks for your kind reminder.

Chao




Re: [Xen-devel] [PATCH V3 6/29] tools/libxl: Add a user configurable parameter to control vIOMMU attributes

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 10:49:22AM +0100, Roger Pau Monné wrote:
>On Thu, Sep 21, 2017 at 11:01:47PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> A field, viommu_info, is added to struct libxl_domain_build_info. Several
>> attributes can be specified by guest config file for virtual IOMMU. These
>> attributes are used for DMAR construction and vIOMMU creation.
>
>IMHO this should come much later in the series, ideally you would
>introduce the xl/libxl code in the last patches, together with the
>xl.cfg man page change.

It can be moved to the end of this series. But I prefer to introduce the
vIOMMU top-down (the user interface goes first, then how to implement a
vIOMMU step by step), since that may be easier to understand.

>
>> diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
>> index 9123585..decd7a8 100644
>> --- a/tools/libxl/libxl_create.c
>> +++ b/tools/libxl/libxl_create.c
>> @@ -27,6 +27,8 @@
>>  
>>  #include 
>>  
>> +#define VIOMMU_VTD_BASE_ADDR0xfed9ULL
>
>This should be in libxl_arch.h see LAPIC_BASE_ADDRESS.

Agree.

>
>> +
>>  int libxl__domain_create_info_setdefault(libxl__gc *gc,
>>   libxl_domain_create_info *c_info)
>>  {
>> @@ -59,6 +61,47 @@ void libxl__rdm_setdefault(libxl__gc *gc, 
>> libxl_domain_build_info *b_info)
>>  LIBXL_RDM_MEM_BOUNDARY_MEMKB_DEFAULT;
>>  }
>>  
>> +static int libxl__viommu_set_default(libxl__gc *gc,
>> + libxl_domain_build_info *b_info)
>> +{
>> +int i;
>> +
>> +if (!b_info->num_viommus)
>> +return 0;
>> +
>> +for (i = 0; i < b_info->num_viommus; i++) {
>> +libxl_viommu_info *viommu = &b_info->viommu[i];
>> +
>> +if (libxl_defbool_is_default(viommu->intremap))
>> +libxl_defbool_set(&viommu->intremap, true);
>> +
>> +if (!libxl_defbool_val(viommu->intremap)) {
>> +LOGE(ERROR, "Cannot create one virtual VTD without intremap");
>> +return ERROR_INVAL;
>> +}
>> +
>> +if (viommu->type == LIBXL_VIOMMU_TYPE_INTEL_VTD) {
>> +/*
>> + * If there are multiple vIOMMUs, we need arrange all vIOMMUs to
>> + * avoid overlap. Put a check here in case we get here for 
>> multiple
>> + * vIOMMUs case.
>> + */
>> +if (b_info->num_viommus > 1) {
>> +LOGE(ERROR, "Multiple vIOMMUs support is under 
>> implementation");
>
>s/LOGE/LOG/ LOGE should only be used when errno is set (which is not
>the case here).

yes.

>
>> +return ERROR_INVAL;
>> +}
>> +
>> +/* Set default values to unexposed fields */
>> +viommu->base_addr = VIOMMU_VTD_BASE_ADDR;
>> +
>> +/* Set desired capbilities */
>> +viommu->cap = VIOMMU_CAP_IRQ_REMAPPING;
>
>I'm not sure whether this code should be in libxl_x86.c, but
>libxl__domain_build_info_setdefault is already quite messed up, so I
>guess it's fine.
>
>> +}
>
>Shouldn't this be:
>
>switch(viommu->type) {
>case LIBXL_VIOMMU_TYPE_INTEL_VTD:
>...
>break;
>
>default:
>return ERROR_INVAL;
>}
>
>So that you catch type being set to an invalid vIOMMU type?

sure. Will update.

>
>> +if (d_config->b_info.num_viommus > 1) {
>> +ret = ERROR_INVAL;
>> +LOGD(ERROR, domid, "Cannot support multiple vIOMMUs");
>> +goto error_out;
>> +}
>
>Er, you already have this check in libxl__viommu_set_default, and in
>any case I would just rely on the hypervisor failing to create more
>than one vIOMMU per domain, rather than adding the same check here.

That is fine with me. Will remove all checks on the number of vIOMMUs
from the toolstack.

Thanks
chao



Re: [Xen-devel] [PATCH V3 22/29] x86/vioapic: extend vioapic_get_vector() to support remapping format RTE

2017-10-19 Thread Chao Gao
On Thu, Oct 19, 2017 at 09:56:34AM -0600, Jan Beulich wrote:
 On 19.10.17 at 17:49,  wrote:
>> On Thu, Sep 21, 2017 at 11:02:03PM -0400, Lan Tianyu wrote:
>>> --- a/xen/arch/x86/hvm/vioapic.c
>>> +++ b/xen/arch/x86/hvm/vioapic.c
>>> @@ -561,11 +561,25 @@ int vioapic_get_vector(const struct domain *d, 
>>> unsigned int gsi)
>>>  {
>>>  unsigned int pin;
>>>  const struct hvm_vioapic *vioapic = gsi_vioapic(d, gsi, &pin);
>>> +struct arch_irq_remapping_request request;
>>>  
>>>  if ( !vioapic )
>>>  return -EINVAL;
>>>  
>>> -return vioapic->redirtbl[pin].fields.vector;
>>> +irq_request_ioapic_fill(&request, vioapic->id, 
>>> vioapic->redirtbl[pin].bits);
>>> +if ( viommu_check_irq_remapping(vioapic->domain, ) )
>>> +{
>>> +int err;
>>> +struct arch_irq_remapping_info info;
>>> +
>>> +err = viommu_get_irq_info(vioapic->domain, &request, &info);
>>> +return !err ? info.vector : err;
>> 
>> You can simplify this as return err ?: info.vector;
>
>At which point the local variable becomes pretty pointless.

Maybe we can remove 'err' and return
unlikely(viommu_get_irq_info(...)) ?: info.vector;

Thanks
Chao



Re: [Xen-devel] [PATCH v2] x86/vpt: guarantee the return value of pt_update_irq() set in vIRR or PIR

2017-10-16 Thread Chao Gao
On Mon, Oct 16, 2017 at 08:26:09AM -0600, Jan Beulich wrote:
 On 16.10.17 at 15:13,  wrote:
>> On Mon, Oct 16, 2017 at 07:15:16AM -0600, Jan Beulich wrote:
>> On 13.10.17 at 07:10,  wrote:
 --- a/xen/arch/x86/hvm/irq.c
 +++ b/xen/arch/x86/hvm/irq.c
 @@ -168,11 +168,13 @@ void hvm_gsi_deassert(struct domain *d, unsigned int 
 gsi)
  spin_unlock(&d->arch.hvm_domain.irq_lock);
  }
  
 -void hvm_isa_irq_assert(
 -struct domain *d, unsigned int isa_irq)
 +int hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq,
 +   int (*get_vector)(const struct domain *d,
 + unsigned int gsi))
  {
  struct hvm_irq *hvm_irq = hvm_domain_irq(d);
  unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
 +int vector = 0;
>>>
>>>Why zero (which is valid aiui) instead of e.g. -1?
>> 
>> vector also serves as the return value. I want to return 0 if no
>> callback is set.  And the callback, get_vector, can override the return
>> value. Do you think it is reasonable?
>
>Why "also" - being the return value is the only purpose of "vector".
>And as said - zero is a valid vector, and I wouldn't like to see the
>function return a valid but meaningless vector number.

But if no callback is set, wouldn't it be a little weird to return -1,
which always means failure? Considering no caller would be confused by the
return value (since, except for the caller introduced by this patch, no one
checks the return value), I don't insist on this.
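
To make the intent concrete, the caller in pt_update_irq() would then do
something like this (a sketch; vioapic_get_vector() passed as the callback,
as described in the v3 changelog):

    pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector);
    if ( pt_vector < 0 ||
         !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
        pt_vector = -1;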

>
 --- a/xen/include/asm-x86/hvm/vmx/vmx.h
 +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
 @@ -109,6 +109,11 @@ static inline int pi_test_and_set_pir(int vector, 
 struct pi_desc *pi_desc)
  return test_and_set_bit(vector, pi_desc->pir);
  }
  
 +static inline int pi_test_pir(int vector, const struct pi_desc *pi_desc)
>>>
>>>This should not be a signed quantity - uint8_t or unsigned int
>>>please.
>> 
>> Yes.
>
>I.e. meaning you're fine with either variant, leaving it up to me
>which one to use?

Yes, both of them are OK with me.

Thanks
Chao



Re: [Xen-devel] [PATCH v2] x86/vpt: guarantee the return value of pt_update_irq() set in vIRR or PIR

2017-10-16 Thread Chao Gao
On Mon, Oct 16, 2017 at 07:15:16AM -0600, Jan Beulich wrote:
 On 13.10.17 at 07:10,  wrote:
>> --- a/xen/arch/x86/hvm/irq.c
>> +++ b/xen/arch/x86/hvm/irq.c
>> @@ -168,11 +168,13 @@ void hvm_gsi_deassert(struct domain *d, unsigned int 
>> gsi)
>>  spin_unlock(&d->arch.hvm_domain.irq_lock);
>>  }
>>  
>> -void hvm_isa_irq_assert(
>> -struct domain *d, unsigned int isa_irq)
>> +int hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq,
>> +   int (*get_vector)(const struct domain *d,
>> + unsigned int gsi))
>>  {
>>  struct hvm_irq *hvm_irq = hvm_domain_irq(d);
>>  unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
>> +int vector = 0;
>
>Why zero (which is valid aiui) instead of e.g. -1?

vector also serves as the return value. I want to return 0 if no
callback is set.  And the callback, get_vector, can override the return
value. Do you think it is reasonable?

>
>> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
>> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
>> @@ -109,6 +109,11 @@ static inline int pi_test_and_set_pir(int vector, 
>> struct pi_desc *pi_desc)
>>  return test_and_set_bit(vector, pi_desc->pir);
>>  }
>>  
>> +static inline int pi_test_pir(int vector, const struct pi_desc *pi_desc)
>
>This should not be a signed quantity - uint8_t or unsigned int
>please.

Yes.

>
>I wouldn't mind making suitable adjustments while committing (and
>then adding my R-b), but that requires your feedback which way
>things should be.

Sure. I would appreciate it.
>
>Also please don't forget to Cc the release manager, unless you
>intend this fix only for after 4.10.

Hi, Julien.

This patch fixes a possible cause of an assertion failure related to the
periodic timer interrupt. OSSTEST occasionally reports a regression when the bug
happens. I intend to merge this patch first and then observe whether
the bug disappears or not. Since Jan said he could do some adjustments to the
patch when committing, could you give your acked-by on this patch?

Thanks
Chao.



[Xen-devel] [PATCH v2] x86/vpt: guarantee the return value of pt_update_irq() set in vIRR or PIR

2017-10-13 Thread Chao Gao
pt_update_irq() is expected to return the vector number of periodic
timer interrupt, which should be set in vIRR of vlapic or in PIR.
Otherwise it would trigger the assertion in vmx_intr_assist(); please see
https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html.

But it fails to achieve that in the following two cases:
1. hvm_isa_irq_assert() may not set the corresponding bit in vIRR because
the mask field of the IOAPIC RTE is set. Please refer to the call tree
vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge(). The patch
checks whether the vector is set or not in vIRR of vlapic or PIR before
returning.

2. someone changes the vector field of IOAPIC RTE between asserting
the irq and getting the vector of the irq, leading to setting the
old vector number but returning a different vector number. This patch
allows hvm_isa_irq_assert() to accept a callback which can get the
interrupt vector with irq_lock held. Thus, no one can change the vector
between the two operations.

Signed-off-by: Chao Gao <chao@intel.com>
---
Passed the two simple xtf tests in
https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html,
which are designed to produce the above two cases.

v2:
- add a callback to hvm_isa_irq_assert() to avoid code duplication
- Constify vlapic argument of vlapic_test_irq()
---
 xen/arch/x86/hvm/dm.c |  2 +-
 xen/arch/x86/hvm/irq.c| 11 +--
 xen/arch/x86/hvm/pmtimer.c|  2 +-
 xen/arch/x86/hvm/rtc.c|  2 +-
 xen/arch/x86/hvm/vlapic.c | 12 
 xen/arch/x86/hvm/vmx/vmx.c|  7 +++
 xen/arch/x86/hvm/vpt.c| 39 ++-
 xen/include/asm-x86/hvm/hvm.h |  1 +
 xen/include/asm-x86/hvm/irq.h | 12 ++--
 xen/include/asm-x86/hvm/vlapic.h  |  1 +
 xen/include/asm-x86/hvm/vmx/vmx.h |  5 +
 11 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c
index 32ade95..a787f43 100644
--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -143,7 +143,7 @@ static int set_isa_irq_level(struct domain *d, uint8_t 
isa_irq,
 hvm_isa_irq_deassert(d, isa_irq);
 break;
 case 1:
-hvm_isa_irq_assert(d, isa_irq);
+hvm_isa_irq_assert(d, isa_irq, NULL);
 break;
 default:
 return -EINVAL;
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index e425df9..d79a367 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -168,11 +168,13 @@ void hvm_gsi_deassert(struct domain *d, unsigned int gsi)
  spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
-void hvm_isa_irq_assert(
-struct domain *d, unsigned int isa_irq)
+int hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq,
+   int (*get_vector)(const struct domain *d,
+ unsigned int gsi))
 {
 struct hvm_irq *hvm_irq = hvm_domain_irq(d);
 unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
+int vector = 0;
 
 ASSERT(isa_irq <= 15);
 
@@ -182,7 +184,12 @@ void hvm_isa_irq_assert(
  (hvm_irq->gsi_assert_count[gsi]++ == 0) )
 assert_irq(d, gsi, isa_irq);
 
+if ( get_vector )
+vector = get_vector(d, gsi);
+
  spin_unlock(&d->arch.hvm_domain.irq_lock);
+
+return vector;
 }
 
 void hvm_isa_irq_deassert(
diff --git a/xen/arch/x86/hvm/pmtimer.c b/xen/arch/x86/hvm/pmtimer.c
index b70c299..435647f 100644
--- a/xen/arch/x86/hvm/pmtimer.c
+++ b/xen/arch/x86/hvm/pmtimer.c
@@ -61,7 +61,7 @@ static void pmt_update_sci(PMTState *s)
  ASSERT(spin_is_locked(&s->lock));
 
 if ( acpi->pm1a_en & acpi->pm1a_sts & SCI_MASK )
-hvm_isa_irq_assert(s->vcpu->domain, SCI_IRQ);
+hvm_isa_irq_assert(s->vcpu->domain, SCI_IRQ, NULL);
 else
 hvm_isa_irq_deassert(s->vcpu->domain, SCI_IRQ);
 }
diff --git a/xen/arch/x86/hvm/rtc.c b/xen/arch/x86/hvm/rtc.c
index bcfa169..cb75b99 100644
--- a/xen/arch/x86/hvm/rtc.c
+++ b/xen/arch/x86/hvm/rtc.c
@@ -75,7 +75,7 @@ static void rtc_update_irq(RTCState *s)
 s->hw.cmos_data[RTC_REG_C] |= RTC_IRQF;
 if ( rtc_mode_is(s, no_ack) )
 hvm_isa_irq_deassert(vrtc_domain(s), RTC_IRQ);
-hvm_isa_irq_assert(vrtc_domain(s), RTC_IRQ);
+hvm_isa_irq_assert(vrtc_domain(s), RTC_IRQ, NULL);
 }
 
 /* Called by the VPT code after it's injected a PF interrupt for us.
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 4bfc53e..50f53bd 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -137,6 +137,18 @@ static void vlapic_error(struct vlapic *vlapic, unsigned 
int errmask)
  spin_unlock_irqrestore(&vlapic->esr_lock, flags);
 }
 
+bool vlapic_test_irq(const struct vlapic *vlapic, uint8_t vec)
+{
+if ( unlikely(vec < 16) )
+return false;
+
+if ( hvm_

Re: [Xen-devel] [PATCH] x86/vpt: fix a bug in pt_update_irq()

2017-10-13 Thread Chao Gao
On Fri, Oct 13, 2017 at 02:25:38AM -0600, Jan Beulich wrote:
>>>> On 09.10.17 at 23:32, <chao@intel.com> wrote:
>
>First of all - please use a better subject. If someone finds another
>bug in this function in, say, half a year's time, how will we tell apart
>the two patches from looking at just the list of titles several years
>later?

Will update.

>
>> pt_update_irq() is expected to return the vector number of periodic
>> timer interrupt, which should be set in vIRR of vlapic. Otherwise it
>> would trigger the assertion in vmx_intr_assist(), please seeing
>> https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html.
>> 
>> But it fails to achieve that in the following two case:
>> 1. hvm_isa_irq_assert() may not set the corresponding bit in vIRR for
>> mask field of IOAPIC RTE is set. Please refer to the call tree
>> vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
>> assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge(). The patch
>> checks whether the vector is set or not in vIRR of vlapic before
>> returning.
>> 
>> 2. someone changes the vector field of IOAPIC RTE between asserting
>> the irq and getting the vector of the irq, leading to setting the
>> old vector number but returning a different vector number. This patch
>> holds the irq_lock when doing the two operations to prevent the case.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>
>Point 2 is very unlikely to be the cause of the failed assertion that
>osstest keeps hitting once in a while. Did your analysis yield
>indication that point 1 is what is happening there?

I believe it is likely to be the case. On the other hand, the
assertion can be triggered in both of the above cases, so both need to be
fixed.

>
>> --- a/xen/arch/x86/hvm/irq.c
>> +++ b/xen/arch/x86/hvm/irq.c
>> @@ -168,20 +168,23 @@ void hvm_gsi_deassert(struct domain *d, unsigned int 
>> gsi)
>>  spin_unlock(>arch.hvm_domain.irq_lock);
>>  }
>>  
>> -void hvm_isa_irq_assert(
>> -struct domain *d, unsigned int isa_irq)
>> +void hvm_isa_irq_assert_locked(struct domain *d, unsigned int isa_irq)
>
>Please don't introduce a non-static function like this. Instead I
>would suggest you introduce a new helper function doing what
>you introduce as replacement to the call to
>hvm_isa_irq_assert(). That'll presumably involve passing a
>get_vector() callback to a wrapper of pt_irq_vector() (or to an
>abbreviated form of it, as "src" is hvm_intsrc_lapic), since I
>understand you need this called with the lock held.
>
>And once you do this I don't think it'll be worthwhile breaking
>out hvm_isa_irq_assert_locked() at all - you'll just have a
>sibling to hvm_isa_irq_assert(). Or, considering the few callers
>the function has, simply giving that function itself an optional
>callback parameter might be even better (eliminating any code
>duplication).

Ok. I understand your suggestion. Will give it a shot.

>
>> --- a/xen/arch/x86/hvm/vlapic.c
>> +++ b/xen/arch/x86/hvm/vlapic.c
>> @@ -137,6 +137,17 @@ static void vlapic_error(struct vlapic *vlapic, 
>> unsigned int errmask)
>>  spin_unlock_irqrestore(>esr_lock, flags);
>>  }
>>  
>> +bool vlapic_test_irq(struct vlapic *vlapic, uint8_t vec)
>
>The way the function is named, the pointer should be const
>qualified. However, the function does more than just testing
>current state:
>
>> +{
>> +if ( unlikely(vec < 16) )
>> +return false;
>> +
>> +if ( hvm_funcs.sync_pir_to_irr )
>> +hvm_funcs.sync_pir_to_irr(vlapic_vcpu(vlapic));
>
>Question is whether this is really necessary, of whether instead
>you could just return the state of the respective PIR bit here. I'd
>prefer that over giving the function a name no longer suggesting
>it leaves all state alone.

It is a good suggestion. But I am inclined to check the PIR bit: if the
bit is set, return true; otherwise return the state of the vIRR bit, in
case the PIR bits have already been synced to vIRR.
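
i.e. roughly (a sketch; it assumes a test_pir hook gets added to hvm_funcs,
as the v2 diffstat suggests):

    bool vlapic_test_irq(const struct vlapic *vlapic, uint8_t vec)
    {
        if ( unlikely(vec < 16) )
            return false;

        /* A set PIR bit will be synced to vIRR eventually; count it now. */
        if ( hvm_funcs.test_pir &&
             hvm_funcs.test_pir(const_vlapic_vcpu(vlapic), vec) )
            return true;

        return vlapic_test_vector(vec, &vlapic->regs->data[APIC_IRR]);
    }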

Thanks
Chao



[Xen-devel] [PATCH] x86/vpt: fix a bug in pt_update_irq()

2017-10-09 Thread Chao Gao
pt_update_irq() is expected to return the vector number of periodic
timer interrupt, which should be set in vIRR of vlapic. Otherwise it
would trigger the assertion in vmx_intr_assist(); please see
https://lists.xenproject.org/archives/html/xen-devel/2017-10/msg00915.html.

But it fails to achieve that in the following two cases:
1. hvm_isa_irq_assert() may not set the corresponding bit in vIRR because
the mask field of the IOAPIC RTE is set. Please refer to the call tree
vmx_intr_assist() -> pt_update_irq() -> hvm_isa_irq_assert() ->
assert_irq() -> assert_gsi() -> vioapic_irq_positive_edge(). The patch
checks whether the vector is set or not in vIRR of vlapic before
returning.

2. someone changes the vector field of IOAPIC RTE between asserting
the irq and getting the vector of the irq, leading to setting the
old vector number but returning a different vector number. This patch
holds the irq_lock when doing the two operations to prevent the case.

Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/arch/x86/hvm/irq.c   | 11 ++
 xen/arch/x86/hvm/vlapic.c| 11 ++
 xen/arch/x86/hvm/vpt.c   | 43 
 xen/include/asm-x86/hvm/irq.h|  1 +
 xen/include/asm-x86/hvm/vlapic.h |  1 +
 5 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
index e425df9..7b0c0b1 100644
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -168,20 +168,23 @@ void hvm_gsi_deassert(struct domain *d, unsigned int gsi)
  spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
-void hvm_isa_irq_assert(
-struct domain *d, unsigned int isa_irq)
+void hvm_isa_irq_assert_locked(struct domain *d, unsigned int isa_irq)
 {
 struct hvm_irq *hvm_irq = hvm_domain_irq(d);
 unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
 
 ASSERT(isa_irq <= 15);
-
-spin_lock(&d->arch.hvm_domain.irq_lock);
+ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock));
 
 if ( !__test_and_set_bit(isa_irq, &hvm_irq->isa_irq.i) &&
  (hvm_irq->gsi_assert_count[gsi]++ == 0) )
 assert_irq(d, gsi, isa_irq);
+}
 
+void hvm_isa_irq_assert(struct domain *d, unsigned int isa_irq)
+{
+spin_lock(&d->arch.hvm_domain.irq_lock);
+hvm_isa_irq_assert_locked(d, isa_irq);
 spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
 
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 4bfc53e..b27b15b 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -137,6 +137,17 @@ static void vlapic_error(struct vlapic *vlapic, unsigned 
int errmask)
 spin_unlock_irqrestore(&vlapic->esr_lock, flags);
 }
 
+bool vlapic_test_irq(struct vlapic *vlapic, uint8_t vec)
+{
+if ( unlikely(vec < 16) )
+return false;
+
+if ( hvm_funcs.sync_pir_to_irr )
+hvm_funcs.sync_pir_to_irr(vlapic_vcpu(vlapic));
+
+return vlapic_test_vector(vec, &vlapic->regs->data[APIC_IRR]);
+}
+
 void vlapic_set_irq(struct vlapic *vlapic, uint8_t vec, uint8_t trig)
 {
 struct vcpu *target = vlapic_vcpu(vlapic);
diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
index 3841140..f4451fd 100644
--- a/xen/arch/x86/hvm/vpt.c
+++ b/xen/arch/x86/hvm/vpt.c
@@ -252,7 +252,7 @@ int pt_update_irq(struct vcpu *v)
 struct list_head *head = &v->arch.hvm_vcpu.tm_list;
 struct periodic_time *pt, *temp, *earliest_pt;
 uint64_t max_lag;
-int irq, is_lapic;
+int irq, is_lapic, pt_vector;
 
 spin_lock(&v->arch.hvm_vcpu.tm_lock);
 
@@ -292,25 +292,42 @@ int pt_update_irq(struct vcpu *v)
 
 spin_unlock(&v->arch.hvm_vcpu.tm_lock);
 
+/*
+ * If the periodic timer interrupt is handled by the lapic, its vector in
+ * IRR is returned and used to set eoi_exit_bitmap for the virtual
+ * interrupt delivery case. Otherwise return -1 to do nothing.
+ */
 if ( is_lapic )
+{
 vlapic_set_irq(vcpu_vlapic(v), irq, 0);
+pt_vector = irq;
+}
 else
 {
 hvm_isa_irq_deassert(v->domain, irq);
-hvm_isa_irq_assert(v->domain, irq);
+/*
+ * Hold 'irq_lock' to prevent changing the interrupt vector between
+ * asserting the irq and getting the interrupt vector of the irq.
+ */
+spin_lock(&v->domain->arch.hvm_domain.irq_lock);
+hvm_isa_irq_assert_locked(v->domain, irq);
+if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) &&
+ v->domain->arch.hvm_domain.vpic[irq >> 3].int_output )
+pt_vector = -1;
+else
+{
+pt_vector = pt_irq_vector(earliest_pt, hvm_intsrc_lapic);
+/*
+ * hvm_isa_irq_assert_locked() may not set the corresponding bit
+ * in vIRR when the mask field of the IOAPIC RTE is set. Check it again.
+ */
+if ( !vlapic_test_irq(vcpu_vlapic(v), pt_vector) )
+pt_vector = -1;
+}
+sp

Re: [Xen-devel] [xen-unstable test] 113959: regressions - FAIL

2017-10-09 Thread Chao Gao
On Mon, Oct 09, 2017 at 12:03:53PM +0100, Andrew Cooper wrote:
>On 09/10/17 08:58, Chao Gao wrote:
>> On Mon, Oct 09, 2017 at 02:13:22PM +0800, Chao Gao wrote:
>>> On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>>>> On Tue, Oct 03, 2017 at 09:55:44AM +, osstest service owner wrote:
>>>>> flight 113959 xen-unstable real [real]
>>>>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>>>>>
>>>>> Regressions :-(
>>>>>
>>>>> Tests which did not succeed and are blocking,
>>>>> including tests which could not be run:
>>>>>  test-amd64-i386-libvirt-xsm  21 leak-check/check fail REGR. vs. 
>>>>> 113954
>>>> This is due to cron running when the leak-check is executed.
>>>>
>>>>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 
>>>>> 113954
>>>>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 
>>>>> 113954
>>>> The test below has triggered the following ASSERT, CCing the Intel
>>>> guys.
>>>>
>>>> Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>>>> Oct  3 06:12:19.191141 (XEN) vIRR:     
>>>>   0001 
>>>> Oct  3 06:12:19.199162 (XEN)  PIR:     
>>>>    
>>>> Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed 
>>>> at intr.c:367
>>>> Oct  3 06:12:19.215215 (XEN) [ Xen-4.10-unstable  x86_64  debug=y   
>>>> Not tainted ]
>>>> Oct  3 06:12:19.223124 (XEN) CPU:1
>>>> Oct  3 06:12:19.223153 (XEN) RIP:e008:[] 
>>>> vmx_intr_assist+0x617/0x637
>>>> Oct  3 06:12:19.231185 (XEN) RFLAGS: 00010292   CONTEXT: 
>>>> hypervisor (d15v0)
>>>> Oct  3 06:12:19.239163 (XEN) rax: 83022dfc802c   rbx: 8300ccc65680 
>>>>   rcx: 
>>>> Oct  3 06:12:19.247169 (XEN) rdx: 83022df7   rsi: 000a 
>>>>   rdi: 82d0804606d8
>>>> Oct  3 06:12:19.255127 (XEN) rbp: 83022df7ff08   rsp: 83022df7fea8 
>>>>   r8:  83022df9
>>>> Oct  3 06:12:19.263114 (XEN) r9:  0001   r10:  
>>>>   r11: 0001
>>>> Oct  3 06:12:19.271109 (XEN) r12:    r13: 82d0803cfba6 
>>>>   r14: 82d0803cfba6
>>>> Oct  3 06:12:19.279119 (XEN) r15: 0004   cr0: 80050033 
>>>>   cr4: 001526e0
>>>> Oct  3 06:12:19.279157 (XEN) cr3: 000214274000   cr2: 5622a2184dbf
>>>> Oct  3 06:12:19.287123 (XEN) ds:    es:    fs:    gs:    
>>>> ss:    cs: e008
>>>> Oct  3 06:12:19.295105 (XEN) Xen code around  
>>>> (vmx_intr_assist+0x617/0x637):
>>>> Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 
>>>> df e8 bb 20 00 00 e9 49 fe ff
>>>> Oct  3 06:12:19.32 (XEN) Xen stack trace from rsp=83022df7fea8:
>>>> Oct  3 06:12:19.311146 (XEN)83022df7ff08 00388030cf76 
>>>> 82d0805a7570 82d08057ad80
>>>> Oct  3 06:12:19.319131 (XEN)83022df7 83022df7fee0 
>>>> 82d08023b9b6 8300ccc65000
>>>> Oct  3 06:12:19.327115 (XEN)000b 0020 
>>>> 00c2 0004
>>>> Oct  3 06:12:19.345094 (XEN)880029eb4000 82d080311c21 
>>>> 0004 00c2
>>>> Oct  3 06:12:19.345177 (XEN)0020 000b 
>>>> 880029eb4000 81adf0a0
>>>> Oct  3 06:12:19.351221 (XEN)  
>>>> 88002d48 
>>>> Oct  3 06:12:19.359439 (XEN)0030  
>>>> 03f8 03f8
>>>> Oct  3 06:12:19.367267 (XEN)81adf0a0 beefbeef 
>>>> 8138a5f4 00bfbeef
>>>> Oct  3 06:12:19.375222 (XEN)0002 88002f803e08 
>>>> beef beef
>>>> Oct  3 06:12:19.383198 (XEN)beef beef 
>>>> beef 0001
>>>> Oct  3 06:12:19.391230 (XEN)8300ccc65000 0031ada20d00 
>>>> 001526e0

Re: [Xen-devel] [xen-unstable test] 113959: regressions - FAIL

2017-10-09 Thread Chao Gao
On Mon, Oct 09, 2017 at 02:13:22PM +0800, Chao Gao wrote:
>On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>>On Tue, Oct 03, 2017 at 09:55:44AM +, osstest service owner wrote:
>>> flight 113959 xen-unstable real [real]
>>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>>> 
>>> Regressions :-(
>>> 
>>> Tests which did not succeed and are blocking,
>>> including tests which could not be run:
>>>  test-amd64-i386-libvirt-xsm  21 leak-check/check fail REGR. vs. 
>>> 113954
>>
>>This is due to cron running when the leak-check is executed.
>>
>>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 
>>> 113954
>>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 
>>> 113954
>>
>>The test below has triggered the following ASSERT, CCing the Intel
>>guys.
>>
>>Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>>Oct  3 06:12:19.191141 (XEN) vIRR:     
>>  0001 
>>Oct  3 06:12:19.199162 (XEN)  PIR:     
>>   
>>Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed at 
>>intr.c:367
>>Oct  3 06:12:19.215215 (XEN) [ Xen-4.10-unstable  x86_64  debug=y   Not 
>>tainted ]
>>Oct  3 06:12:19.223124 (XEN) CPU:1
>>Oct  3 06:12:19.223153 (XEN) RIP:e008:[] 
>>vmx_intr_assist+0x617/0x637
>>Oct  3 06:12:19.231185 (XEN) RFLAGS: 00010292   CONTEXT: hypervisor 
>>(d15v0)
>>Oct  3 06:12:19.239163 (XEN) rax: 83022dfc802c   rbx: 8300ccc65680   
>>rcx: 
>>Oct  3 06:12:19.247169 (XEN) rdx: 83022df7   rsi: 000a   
>>rdi: 82d0804606d8
>>Oct  3 06:12:19.255127 (XEN) rbp: 83022df7ff08   rsp: 83022df7fea8   
>>r8:  83022df9
>>Oct  3 06:12:19.263114 (XEN) r9:  0001   r10:    
>>r11: 0001
>>Oct  3 06:12:19.271109 (XEN) r12:    r13: 82d0803cfba6   
>>r14: 82d0803cfba6
>>Oct  3 06:12:19.279119 (XEN) r15: 0004   cr0: 80050033   
>>cr4: 001526e0
>>Oct  3 06:12:19.279157 (XEN) cr3: 000214274000   cr2: 5622a2184dbf
>>Oct  3 06:12:19.287123 (XEN) ds:    es:    fs:    gs:    ss: 
>>   cs: e008
>>Oct  3 06:12:19.295105 (XEN) Xen code around  
>>(vmx_intr_assist+0x617/0x637):
>>Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 df 
>>e8 bb 20 00 00 e9 49 fe ff
>>Oct  3 06:12:19.32 (XEN) Xen stack trace from rsp=83022df7fea8:
>>Oct  3 06:12:19.311146 (XEN)83022df7ff08 00388030cf76 
>>82d0805a7570 82d08057ad80
>>Oct  3 06:12:19.319131 (XEN)83022df7 83022df7fee0 
>>82d08023b9b6 8300ccc65000
>>Oct  3 06:12:19.327115 (XEN)000b 0020 
>>00c2 0004
>>Oct  3 06:12:19.345094 (XEN)880029eb4000 82d080311c21 
>>0004 00c2
>>Oct  3 06:12:19.345177 (XEN)0020 000b 
>>880029eb4000 81adf0a0
>>Oct  3 06:12:19.351221 (XEN)  
>>88002d48 
>>Oct  3 06:12:19.359439 (XEN)0030  
>>03f8 03f8
>>Oct  3 06:12:19.367267 (XEN)81adf0a0 beefbeef 
>>8138a5f4 00bfbeef
>>Oct  3 06:12:19.375222 (XEN)0002 88002f803e08 
>>beef beef
>>Oct  3 06:12:19.383198 (XEN)beef beef 
>>beef 0001
>>Oct  3 06:12:19.391230 (XEN)8300ccc65000 0031ada20d00 
>>001526e0
>>Oct  3 06:12:19.399336 (XEN) Xen call trace:
>>Oct  3 06:12:19.399389 (XEN)[] 
>>vmx_intr_assist+0x617/0x637
>>Oct  3 06:12:19.407337 (XEN)[] 
>>vmx_asm_vmexit_handler+0x41/0x120
>>Oct  3 06:12:19.407380 (XEN) 
>>Oct  3 06:12:19.415246 (XEN) 
>>Oct  3 06:12:19.415278 (XEN) 
>>Oct  3 06:12:19.415307 (XEN) Panic on CPU 1:
>>Oct  3 06:12:19.415332 (XEN) Assertion 'intack.vector >= pt_vector' failed at 
>>intr.c:367
>>Oct  3 06:12:19.423432 (XEN) 
>
>(CC Jan)
>
>Hi, Roger.
>
>I sent a patch to fix a possible cause of this bug, seeing
>https://lists.xenproject.org/archive

Re: [Xen-devel] [xen-unstable test] 113959: regressions - FAIL

2017-10-09 Thread Chao Gao
On Tue, Oct 03, 2017 at 11:08:01AM +0100, Roger Pau Monné wrote:
>On Tue, Oct 03, 2017 at 09:55:44AM +, osstest service owner wrote:
>> flight 113959 xen-unstable real [real]
>> http://logs.test-lab.xenproject.org/osstest/logs/113959/
>> 
>> Regressions :-(
>> 
>> Tests which did not succeed and are blocking,
>> including tests which could not be run:
>>  test-amd64-i386-libvirt-xsm  21 leak-check/check fail REGR. vs. 
>> 113954
>
>This is due to cron running when the leak-check is executed.
>
>>  test-armhf-armhf-xl-multivcpu  5 host-ping-check-native  fail REGR. vs. 
>> 113954
>>  test-amd64-i386-xl-qemut-debianhvm-amd64 17 guest-stop   fail REGR. vs. 
>> 113954
>
>The test below has triggered the following ASSERT, CCing the Intel
>guys.
>
>Oct  3 06:12:00.415168 (XEN) d15v0: intack: 2:30 pt: 38
>Oct  3 06:12:19.191141 (XEN) vIRR:     
>  0001 
>Oct  3 06:12:19.199162 (XEN)  PIR:     
>   
>Oct  3 06:12:19.207160 (XEN) Assertion 'intack.vector >= pt_vector' failed at 
>intr.c:367
>Oct  3 06:12:19.215215 (XEN) [ Xen-4.10-unstable  x86_64  debug=y   Not 
>tainted ]
>Oct  3 06:12:19.223124 (XEN) CPU:1
>Oct  3 06:12:19.223153 (XEN) RIP:e008:[] 
>vmx_intr_assist+0x617/0x637
>Oct  3 06:12:19.231185 (XEN) RFLAGS: 00010292   CONTEXT: hypervisor 
>(d15v0)
>Oct  3 06:12:19.239163 (XEN) rax: 83022dfc802c   rbx: 8300ccc65680   
>rcx: 
>Oct  3 06:12:19.247169 (XEN) rdx: 83022df7   rsi: 000a   
>rdi: 82d0804606d8
>Oct  3 06:12:19.255127 (XEN) rbp: 83022df7ff08   rsp: 83022df7fea8   
>r8:  83022df9
>Oct  3 06:12:19.263114 (XEN) r9:  0001   r10:    
>r11: 0001
>Oct  3 06:12:19.271109 (XEN) r12:    r13: 82d0803cfba6   
>r14: 82d0803cfba6
>Oct  3 06:12:19.279119 (XEN) r15: 0004   cr0: 80050033   
>cr4: 001526e0
>Oct  3 06:12:19.279157 (XEN) cr3: 000214274000   cr2: 5622a2184dbf
>Oct  3 06:12:19.287123 (XEN) ds:    es:    fs:    gs:    ss: 
>   cs: e008
>Oct  3 06:12:19.295105 (XEN) Xen code around  
>(vmx_intr_assist+0x617/0x637):
>Oct  3 06:12:19.303150 (XEN)  41 bf 00 00 00 00 eb a0 <0f> 0b 89 ce 48 89 df 
>e8 bb 20 00 00 e9 49 fe ff
>Oct  3 06:12:19.32 (XEN) Xen stack trace from rsp=83022df7fea8:
>Oct  3 06:12:19.311146 (XEN)83022df7ff08 00388030cf76 
>82d0805a7570 82d08057ad80
>Oct  3 06:12:19.319131 (XEN)83022df7 83022df7fee0 
>82d08023b9b6 8300ccc65000
>Oct  3 06:12:19.327115 (XEN)000b 0020 
>00c2 0004
>Oct  3 06:12:19.345094 (XEN)880029eb4000 82d080311c21 
>0004 00c2
>Oct  3 06:12:19.345177 (XEN)0020 000b 
>880029eb4000 81adf0a0
>Oct  3 06:12:19.351221 (XEN)  
>88002d48 
>Oct  3 06:12:19.359439 (XEN)0030  
>03f8 03f8
>Oct  3 06:12:19.367267 (XEN)81adf0a0 beefbeef 
>8138a5f4 00bfbeef
>Oct  3 06:12:19.375222 (XEN)0002 88002f803e08 
>beef beef
>Oct  3 06:12:19.383198 (XEN)beef beef 
>beef 0001
>Oct  3 06:12:19.391230 (XEN)8300ccc65000 0031ada20d00 
>001526e0
>Oct  3 06:12:19.399336 (XEN) Xen call trace:
>Oct  3 06:12:19.399389 (XEN)[] 
>vmx_intr_assist+0x617/0x637
>Oct  3 06:12:19.407337 (XEN)[] 
>vmx_asm_vmexit_handler+0x41/0x120
>Oct  3 06:12:19.407380 (XEN) 
>Oct  3 06:12:19.415246 (XEN) 
>Oct  3 06:12:19.415278 (XEN) 
>Oct  3 06:12:19.415307 (XEN) Panic on CPU 1:
>Oct  3 06:12:19.415332 (XEN) Assertion 'intack.vector >= pt_vector' failed at 
>intr.c:367
>Oct  3 06:12:19.423432 (XEN) 

(CC Jan)

Hi, Roger.

I sent a patch to fix a possible cause of this bug, seeing
https://lists.xenproject.org/archives/html/xen-devel/2017-04/msg03254.html.

Due to the Xen 4.9 release, I put this patch aside and later forgot to
continue working on this bug. Sorry for that. Of course, I will fix this
bug.

I thought the root cause was:
When injecting a periodic timer interrupt in vmx_intr_assist(),
the same data is read multiple times during one event delivery. For
example, if a periodic timer interrupt comes from the PIT, the
corresponding IOAPIC RTE is accessed in pt_update_irq() when setting
the corresponding bit in vIRR. When this function returns, the RTE is
accessed again to get the vector that was just set in vIRR. Between
the two accesses, the content of the RTE may have been changed by
another CPU, since no protection method is in use. This can trigger
the assertion failure in vmx_intr_assist().

For example, 

Re: [Xen-devel] How to create a PVHv2 guest

2017-09-11 Thread Chao Gao
On Mon, Sep 11, 2017 at 10:14:15AM +0100, Roger Pau Monné wrote:
>On Mon, Sep 11, 2017 at 09:58:01AM +0800, Chao Gao wrote:
>> Hi, Roger.
>> 
>> I meet an error when creating a pvh guest. I am using commit 6e2a4c73564a. 
>> From the error log, I found bootlate_pv()->pin_table always failed. And the
>> failure was caused by is_pv_domain(pg_owner) in do_mmuext_op(). Do you have
>> any idea on this?
>
>IIRC bootlate_pv should never be called for a PVH guest. I'm not
>really sure how you can get there, can you paste/attach your Linux
>kernel config file?

I really did get there. I hadn't set XEN_PVH in the kernel config. After
setting XEN_PVH, the error disappears. From the output of
xc_dom_compat_check(), I guess the guest was wrongly treated as a PV guest.
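For reference, the relevant kernel options (as far as I can tell) are just:

CONFIG_XEN=y
CONFIG_XEN_PVH=y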

>
>Also I'm not sure using a qcow2 disk is going to work properly, the
>current PVHv2 tools implementation will not spawn a QEMU instance to
>act as the backend.

Ok. I will try to use a raw image.

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] How to create a PVHv2 guest

2017-09-10 Thread Chao Gao
Hi, Roger.

I met an error when creating a PVH guest. I am using commit 6e2a4c73564a.
From the error log, I found that bootlate_pv()->pin_table always failed, and
the failure was caused by the is_pv_domain(pg_owner) check in do_mmuext_op().
Do you have any idea about this?

The guest config file is:
builder = "hvm"
name = "vVTD_ASS_02_1467338450"
memory=3300
vcpus=4
disk = [ '/home/gao/performance_tuning/env/xen_centos2.qcow2,qcow2,hda,rw' ]
boot='c'
device_model_version='none'
kernel="/home/gao/hvmlite/vmlinuz-4.13.0-rc1+"
ramdisk="/home/gao/hvmlite/initramfs-4.13.0-rc1+.img"
cmdline="root=/dev/mapper/centos-root ro crashkernel=auto rd.lvm.lv=centos/root 
rd.lvm.lv=centos/swap rhgb console=tty0 console=ttyS0,115200 x2apic_phys 
xen_nopv LANG=en_US.UTF-8"


The error log is:
Parsing config from /home/gao/Downloads/config.vmxVTD_ASS_02
{
"c_info": {
"type": "hvm",
"name": "vVTD_ASS_02_1467338450",
"uuid": "721df691-dae5-401d-94a3-cffacb67e6b5",
"run_hotplug_scripts": "True"
},
"b_info": {
"max_vcpus": 4,
"avail_vcpus": [
0,
1,
2,
3
],
"max_memkb": 3379200,
"target_memkb": 3379200,
"shadow_memkb": 30496,
"device_model_version": "none",
"sched_params": {

},
"claim_mode": "True",
"kernel": "/home/gao/hvmlite/vmlinuz-4.13.0-rc1+",
"cmdline": "root=/dev/mapper/centos-root ro crashkernel=auto 
rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb console=tty0 
console=ttyS0,115200 x2apic_phys xen_nopv LANG=en_US.UTF-8",
"ramdisk": "/home/gao/hvmlite/initramfs-4.13.0-rc1+.img",
"type.hvm": {
"vga": {

},
"vnc": {

},
"sdl": {

},
"spice": {

},
"boot": "c",
"rdm": {

}
},
"arch_arm": {

}
},
"disks": [
{
"pdev_path": "/home/gao/performance_tuning/env/xen_centos2.qcow2",
"vdev": "hda",
"format": "qcow2",
"readwrite": 1
}
],
"on_reboot": "restart",
"on_soft_reset": "soft_reset"
}
libxl: debug: libxl_create.c:1609:do_domain_create: Domain 0:ao 0x7f4cc0: 
create: how=(nil) callback=(nil) poller=0x7f4ac0
libxl: debug: libxl_device.c:361:libxl__device_disk_set_backend: Disk vdev=hda 
spec.backend=unknown
libxl: debug: libxl_device.c:324:disk_try_backend: Disk vdev=hda, backend phy 
unsuitable due to format qcow2
libxl: debug: libxl_device.c:396:libxl__device_disk_set_backend: Disk vdev=hda, 
using backend qdisk
libxl: debug: libxl_create.c:965:initiate_domain_create: Domain 19:running 
bootloader
libxl: debug: libxl_bootloader.c:328:libxl__bootloader_run: Domain 19:not a PV 
domain, skipping bootloader
libxl: debug: libxl_event.c:686:libxl__ev_xswatch_deregister: watch w=0x7f7628: 
deregister unregistered
libxl: debug: libxl_numa.c:502:libxl__get_numa_candidate: New best NUMA 
placement candidate found: nr_nodes=1, nr_cpus=272, nr_vcpus=132, 
free_memkb=87413
libxl: detail: libxl_dom.c:182:numa_place_domain: NUMA placement candidate with 
1 nodes, 272 cpus and 87413 KB free selected
domainbuilder: detail: xc_dom_allocate: cmdline="root=/dev/mapper/centos-root 
ro crashkernel=auto rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb 
console=tty0 console=ttyS0,115200 x2apic_phys xen_nopv LANG=en_US.UTF-8", 
features=""
domainbuilder: detail: xc_dom_kernel_file: 
filename="/home/gao/hvmlite/vmlinuz-4.13.0-rc1+"
domainbuilder: detail: xc_dom_malloc_filemap: 6130 kB
domainbuilder: detail: xc_dom_ramdisk_file: 
filename="/home/gao/hvmlite/initramfs-4.13.0-rc1+.img"
domainbuilder: detail: xc_dom_malloc_filemap: 50 MB
domainbuilder: detail: xc_dom_boot_xen_init: ver 4.10, caps xen-3.0-x86_64 
xen-3.0-x86_32p hvm-3.0-x86_32 hvm-3.0-x86_32p hvm-3.0-x86_64 
domainbuilder: detail: xc_dom_parse_image: called
domainbuilder: detail: xc_dom_find_loader: trying multiboot-binary loader ... 
domainbuilder: detail: loader probe failed
domainbuilder: detail: xc_dom_find_loader: trying HVM-generic loader ... 
domainbuilder: detail: loader probe failed
domainbuilder: detail: xc_dom_find_loader: trying Linux bzImage loader ... 
domainbuilder: detail: xc_dom_malloc: 20922 kB
domainbuilder: detail: xc_dom_do_gunzip: unzip ok, 0x5eee00 -> 0x146e9b8
domainbuilder: detail: loader probe OK
xc: detail: ELF: phdr: paddr=0x100 memsz=0xb5c000
xc: detail: ELF: phdr: paddr=0x1c0 memsz=0x235000
xc: detail: ELF: phdr: paddr=0x1e35000 memsz=0x1ce58
xc: detail: ELF: phdr: paddr=0x1e52000 memsz=0x52a000
xc: detail: ELF: memory: 0x100 -> 0x237c000
xc: detail: ELF: note: GUEST_OS = "linux"
xc: detail: ELF: note: GUEST_VERSION = "2.6"
xc: detail: ELF: note: XEN_VERSION = "xen-3.0"
xc: detail: ELF: note: VIRT_BASE = 0x8000
xc: detail: ELF: note: INIT_P2M = 0x80
xc: detail: ELF: note: ENTRY = 

Re: [Xen-devel] [PATCH v2 1/4] x86/dom0: prevent access to MMCFG areas for PVH Dom0

2017-09-04 Thread Chao Gao
On Mon, Sep 04, 2017 at 10:26:04AM +0100, Roger Pau Monné wrote:
>(Adding Chao again because my MUA seems to drop him each time)
>
>On Mon, Sep 04, 2017 at 10:00:00AM +0100, Roger Pau Monné wrote:
>> On Mon, Sep 04, 2017 at 02:25:10PM +0800, Chao Gao wrote:
>> > On Thu, Aug 31, 2017 at 11:09:48AM +0100, Roger Pau Monne wrote:
>> > >I tested Nehalem, Sandy Bridge and Haswell, but sadly not Ivy Bridge
>> > >(in fact I didn't even know about Ivy Bridge, that's why I said all
>> > >pre-Haswell).
>> > >
>> > >In fact I'm now trying with a Nehalem processor that seem to work, so
>> > >whatever this issue is it certainly doesn't affect all models or
>> > >chipsets.
>> > 
>> > Hi, Roger.
>> > 
>> > Last week, I borrowed a Sandy Bridge with Intel(R) Xeon(R) E5-2690
>> > 2.7GHz and tested with 'dom0=pvh'. But I didn't see the machine hang.
>> > 
>> > I also tested on Haswell and found RMRRs in dmar are incorrect on my
>> > haswell. The e820 on that machine is:
>> > (XEN) [0.00] Xen-e820 RAM map:
>> > (XEN) [0.00]   - 0009a400 (usable)
>> > (XEN) [0.00]  0009a400 - 000a (reserved)
>> > (XEN) [0.00]  000e - 0010 (reserved)
>> > (XEN) [0.00]  0010 - 6ff84000 (usable)
>> > (XEN) [0.00]  6ff84000 - 7ac51000 (reserved)
>> > (XEN) [0.00]  7ac51000 - 7b681000 (ACPI NVS)
>> > (XEN) [0.00]  7b681000 - 7b7cf000 (ACPI data)
>> > (XEN) [0.00]  7b7cf000 - 7b80 (usable)
>> > (XEN) [0.00]  7b80 - 9000 (reserved)
>> > (XEN) [0.00]  fed1c000 - fed2 (reserved)
>> > (XEN) [0.00]  ff40 - 0001 (reserved)
>> > (XEN) [0.00]  0001 - 00208000 (usable)
>> > 
>> > And the RMRRs in DMAR are:
>> > (XEN) [0.00] [VT-D]found ACPI_DMAR_RMRR:
>> > (XEN) [0.00] [VT-D] endpoint: :05:00.0
>> > (XEN) [0.00] [VT-D]dmar.c:638:   RMRR region: base_addr 723b4000
>> > end_addr 7a3f3fff
>> > (XEN) [0.00] [VT-D]found ACPI_DMAR_RMRR:
>> > (XEN) [0.00] [VT-D] endpoint: :00:1d.0
>> > (XEN) [0.00] [VT-D] endpoint: :00:1a.0
>> > (XEN) [0.00] [VT-D]dmar.c:638:   RMRR region: base_addr 723ac000
>> > end_addr 723aefff
>> > (Endpoint 05:00.0 is a RAID bus controller. Endpoints 00.1d.0 and 00.1a.0
>> > are USB controllers.)
>> > 
>> > After DMA remapping is enabled, two DMA translation faults are reported
>> > by VT-d:
>> > (XEN) [9.547924] [VT-D]iommu_enable_translation: iommu->reg =
>> > 82c00021b000
>> > (XEN) [9.550620] [VT-D]iommu_enable_translation: iommu->reg =
>> > 82c00021d000
>> > (XEN) [9.553327] [VT-D]iommu.c:921: iommu_fault_status: Primary
>> > Pending Fault
>> > (XEN) [9.555906] [VT-D]DMAR:[DMA Read] Request device [:00:1a.0]
>> > fault addr 7a3f5000, iommu reg = 82c00021d000
>> > (XEN) [9.558537] [VT-D]DMAR: reason 06 - PTE Read access is not set
>> > (XEN) [9.559860] print_vtd_entries: iommu #1 dev :00:1a.0 gmfn
>> > 7a3f5
>> > (XEN) [9.561179] root_entry[00] = 107277c001
>> > (XEN) [9.562447] context[d0] = 2_1072c06001
>> > (XEN) [9.563776] l4[000] = 9c202f171107
>> > (XEN) [9.565125] l3[001] = 9c202f152107
>> > (XEN) [9.566483] l2[1d1] = 9c10727ce107
>> > (XEN) [9.567821] l1[1f5] = 8000
>> > (XEN) [9.569168] l1[1f5] not present
>> > (XEN) [9.570502] [VT-D]DMAR:[DMA Read] Request device [:00:1d.0]
>> > fault addr 7a3f4000, iommu reg = 82c00021d000
>> > (XEN) [9.573147] [VT-D]DMAR: reason 06 - PTE Read access is not set
>> > (XEN) [9.574488] print_vtd_entries: iommu #1 dev :00:1d.0 gmfn
>> > 7a3f4
>> > (XEN) [9.575819] root_entry[00] = 107277c001
>> > (XEN) [9.577129] context[e8] = 2_1072c06001
>> > (XEN) [9.578439] l4[000] = 9c202f171107
>> > (XEN) [9.579778] l3[001] = 9c202f152107
>> > (XEN) [9.58] l2[1d1] = 9c10727ce107
>> > (XEN) [9.582482] l1[1f4] = 8000
>> > (XEN) [9.583812] l1[1f4] not present
>> &g

Re: [Xen-devel] [PATCH v2 1/4] x86/dom0: prevent access to MMCFG areas for PVH Dom0

2017-09-04 Thread Chao Gao
On Thu, Aug 31, 2017 at 11:09:48AM +0100, Roger Pau Monne wrote:
>On Thu, Aug 31, 2017 at 04:45:23PM +0800, Chao Gao wrote:
>> On Thu, Aug 31, 2017 at 10:03:19AM +0100, Roger Pau Monne wrote:
>> >On Thu, Aug 31, 2017 at 03:32:42PM +0800, Chao Gao wrote:
>> >> On Tue, Aug 29, 2017 at 08:33:25AM +0100, Roger Pau Monne wrote:
>> >> >On Mon, Aug 28, 2017 at 06:18:13AM +, Tian, Kevin wrote:
>> >> >> > From: Roger Pau Monne [mailto:roger@citrix.com]
>> >> >> > Sent: Friday, August 25, 2017 9:59 PM
>> >> >> > 
>> >> >> > On Fri, Aug 25, 2017 at 06:25:36AM -0600, Jan Beulich wrote:
>> >> >> > > >>> On 25.08.17 at 14:15, <roger@citrix.com> wrote:
>> >> >> > > > On Wed, Aug 23, 2017 at 02:16:38AM -0600, Jan Beulich wrote:
>> >> >> > > >> >>> On 22.08.17 at 15:54, <roger@citrix.com> wrote:
>> >> >> > > >> > On Tue, Aug 22, 2017 at 06:26:23AM -0600, Jan Beulich wrote:
>> >> >> > > >> >> >>> On 11.08.17 at 18:43, <roger@citrix.com> wrote:
>> >> >> > > >> >> > --- a/xen/arch/x86/dom0_build.c
>> >> >> > > >> >> > +++ b/xen/arch/x86/dom0_build.c
>> >> >> > > >> >> > @@ -440,6 +440,10 @@ int __init
>> >> >> > dom0_setup_permissions(struct domain *d)
>> >> >> > > >> >> >  rc |= rangeset_add_singleton(mmio_ro_ranges, 
>> >> >> > > >> >> > mfn);
>> >> >> > > >> >> >  }
>> >> >> > > >> >> >
>> >> >> > > >> >> > +/* For PVH prevent access to the MMCFG areas. */
>> >> >> > > >> >> > +if ( dom0_pvh )
>> >> >> > > >> >> > +rc |= pci_mmcfg_set_domain_permissions(d);
>> >> >> > > >> >>
>> >> >> > > >> >> What about ones reported by Dom0 later on? Which then raises 
>> >> >> > > >> >> the
>> >> >> > > >> >> question whether ...
>> >> >> > > >> >
>> >> >> > > >> > This should be dealt with in the PHYSDEVOP_pci_mmcfg_reserved
>> >> >> > handler.
>> >> >> > > >> > But since you propose to do white listing, I guess it doesn't 
>> >> >> > > >> > matter
>> >> >> > > >> > that much anymore.
>> >> >> > > >>
>> >> >> > > >> Well, a fundamental question is whether white listing would 
>> >> >> > > >> work in
>> >> >> > > >> the first place. I could see room for severe problems e.g. with 
>> >> >> > > >> ACPI
>> >> >> > > >> methods wanting to access MMIO that's not described by any PCI
>> >> >> > > >> devices' BARs. Typically that would be regions in the chipset 
>> >> >> > > >> which
>> >> >> > > >> firmware is responsible for configuring/managing, the addresses 
>> >> >> > > >> of
>> >> >> > > >> which can be found/set in custom config space registers.
>> >> >> > > >
>> >> >> > > > The question would also be what would Xen allow in such 
>> >> >> > > > white-listing.
>> >> >> > > > Obviously you can get to map the same using both white-list and
>> >> >> > > > black-listing (see below).
>> >> >> > >
>> >> >> > > Not really - what you've said there regarding MMCFG regions is
>> >> >> > > a clear indication that we should _not_ map reserved regions, i.e.
>> >> >> > > it would need to be full white listing with perhaps just the PCI
>> >> >> > > device BARs being handled automatically.
>> >> >> > 
>> >> >> > I've tried just mapping the BARs and that sadly doesn't work, the box
>> >> >> > hangs after the IOMMU is enabled:
>> >> >> > 
>> >> >> > [...]
>> >> >&

Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-09-01 Thread Chao Gao
On Fri, Sep 01, 2017 at 03:13:17AM -0600, Jan Beulich wrote:
 On 01.09.17 at 09:55,  wrote:
>> On Fri, Sep 01, 2017 at 02:24:08AM -0600, Jan Beulich wrote:
>> On 01.09.17 at 03:39,  wrote:
 After thinking it again, I want to define the counter as
 a unsigned int variable for the following reasion:
 1. It is definite that the counter is closely related with
 list_add() and list_del(). If the list is protected by the
 lock, it is straightforward that the counter is also protected
 by the lock.
 2. In patch 3, althought there are some lock-less readers, we
 will check the counter still meets our requirement with the lock
 held. Thus, I don't think there is a racing issue.
>>>
>>>I think that's fine, but then you still don't need LOCKed accesses
>>>to the counter for updating it; write_atomic() will suffice afaict.
>> 
>> A stupid question.
>> Is it contradictory that you think the counter can be protected by
>> the lock while suggesting using write_atomic() instead of LOCKed
>> accesses?
>> 
>> updating the counter is always accompanied by updating list and updating
>> list should in locked region. I meaned things like:
>> 
>> spin_lock()
>> list_add()
>> counter++
>> spin_unlock()
>> 
>> However, I am afraid that not using LOCKed accesses but using
>> write_atomic() means something like (separating updating the counter
>> from updating the list I think is not good):
>> 
>> spin_lock()
>> list_add()
>> spin_unlock()
>> write_atomic()
>
>No, I mean
>
> spin_lock()
> list_add()
> write_atomic()
> spin_unlock()
>
>whereas ...
>
>> And I think this version is:
>> 
>> spin_lock()
>> list_add()
>> add_sized()
>> spin_unlock()
>
>... this produces a needless LOCKed instruction redundant with being
>inside the locked region).

It seems add_sized() won't be a LOCKed instruction:
#define build_add_sized(name, size, type, reg) \
static inline void name(volatile type *addr, type val)  \
{   \
asm volatile("add" size " %1,%0"\
 : "=m" (*addr) \
 : reg (val));  \
}
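So inside the locked region the two variants would generate essentially
the same plain ADD (sketch; the pbv/vpbv field names are as I recall them
from this patch):

    /* current patch */
    spin_lock(&vpbv->lock);
    list_add(&pbv->list, &vpbv->list);
    add_sized(&vpbv->counter, 1);
    spin_unlock(&vpbv->lock);

    /* with write_atomic(), as I understand your suggestion */
    spin_lock(&vpbv->lock);
    list_add(&pbv->list, &vpbv->list);
    write_atomic(&vpbv->counter, vpbv->counter + 1);
    spin_unlock(&vpbv->lock);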

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-09-01 Thread Chao Gao
On Fri, Sep 01, 2017 at 02:24:08AM -0600, Jan Beulich wrote:
 On 01.09.17 at 03:39,  wrote:
>> After thinking it again, I want to define the counter as
>> a unsigned int variable for the following reasion:
>> 1. It is definite that the counter is closely related with
>> list_add() and list_del(). If the list is protected by the
>> lock, it is straightforward that the counter is also protected
>> by the lock.
>> 2. In patch 3, althought there are some lock-less readers, we
>> will check the counter still meets our requirement with the lock
>> held. Thus, I don't think there is a racing issue.
>
>I think that's fine, but then you still don't need LOCKed accesses
>to the counter for updating it; write_atomic() will suffice afaict.

A stupid question:
Isn't it contradictory that you think the counter can be protected by
the lock while suggesting write_atomic() instead of LOCKed
accesses?

Updating the counter is always accompanied by updating the list, and
updating the list should be done in the locked region. I meant things like:

spin_lock()
list_add()
counter++
spin_unlock()

However, I am afraid that not using LOCKed accesses but using
write_atomic() means something like the following (separating the
counter update from the list update, which I think is not good):

spin_lock()
list_add()
spin_unlock()
write_atomic()

And I think this version is:

spin_lock()
list_add()
add_sized()
spin_unlock()

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-08-31 Thread Chao Gao
On Thu, Aug 31, 2017 at 02:33:57AM -0600, Jan Beulich wrote:
 On 31.08.17 at 09:15,  wrote:
>> On Thu, Aug 31, 2017 at 01:42:53AM -0600, Jan Beulich wrote:
>> On 31.08.17 at 00:57,  wrote:
 On Wed, Aug 30, 2017 at 10:00:49AM -0600, Jan Beulich wrote:
 On 16.08.17 at 07:14,  wrote:
>> @@ -100,6 +101,24 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>>  spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>>  }
>>  
>> +static void vmx_pi_add_vcpu(struct pi_blocking_vcpu *pbv,
>> +struct vmx_pi_blocking_vcpu *vpbv)
>> +{
>> +ASSERT(spin_is_locked(&vpbv->lock));
>
>You realize this is only a very weak check for a non-recursive lock?
 
 I just thought the lock should be held when adding one entry to the
 blocking list. Do you think we should remove this check or make it
 stricter?
>>>
>>>Well, the primary purpose of my comment was to make you aware
>>>of the fact. If the weak check is good enough for you, then fine.
>> 
>> To be honest, I don't know the difference between weak check and tight
>> check.
>
>For non-recursive locks spin_is_locked() only tells you if _any_
>CPU in the system currently holds the lock. For recursive ones it
>checks whether it's the local CPU that owns the lock.
>
>>>Removing the check would be a bad idea imo (but see also below);
>>>tightening might be worthwhile, but might also go too far (depending
>>>mainly on how clearly provable it is that all callers actually hold the
>>>lock).
>> 
>> IMO, the lock was introduced (not by me) to protect the blocking list.
>> list_add() and list_del() should be performed with the lock held. So I
>> think it is clear that all callers should hold the lock.
>
>Good.
>
>> +add_sized(&vpbv->counter, 1);
>> +ASSERT(read_atomic(&vpbv->counter));
>
>Why add_sized() and read_atomic() when you hold the lock?
>
 
 In patch 3, frequent reading the counter is used to find a suitable
 vcpu and we can use add_sized() and read_atomic() to avoid acquiring the
 lock. In one word, the lock doesn't protect the counter.
>>>
>>>In that case it would be more natural to switch to the atomic
>>>accesses there. Plus you still wouldn't need read_atomic()
>>>here, with the lock held. Furthermore I would then wonder
>>>whether it wasn't better to use atomic_t for the counter at
>> 
>> Is there some basic guide on when it is better to use read_atomic()
>> and add_sized() and when it is better to define a atomic variable
>> directly?
>
>If an atomic_t variable fits your needs, I think it should always
>be preferred. add_sized() was introduced for a case where an
>atomic_t variable would not have been usable. Please also
>consult older commits for understanding the background.
>
>>>that point. Also with a lock-less readers the requirement to
>>>hold a lock here (rather than using suitable LOCKed accesses)
>>>becomes questionable too.
>> 
>> As I said above, I think the lock is used to protect the list.
>> 
>> I think this patch has two parts:
>> 1. Move all list operations to two inline functions. (with this, adding
>> a counter is easier and don't need add code in several places.)
>> 
>> 2. Add a counter.
>
>With it being left unclear whether the counter is meant to
>also be protected by the lock: In the patch here you claim it
>is, yet by later introducing lock-less readers you weaken
>that model. Hence the request to bring things into a
>consistent state right away, and ideally also into the final
>state.
>

Hi, Jan.

After thinking about it again, I want to define the counter as
an unsigned int variable for the following reasons:
1. The counter is clearly closely related to list_add() and
list_del(). If the list is protected by the lock, it is
straightforward that the counter is also protected by the lock.
2. In patch 3, although there are some lock-less readers, we
still check that the counter meets our requirement with the lock
held. Thus, I don't think there is a racing issue.

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 1/4] x86/dom0: prevent access to MMCFG areas for PVH Dom0

2017-08-31 Thread Chao Gao
On Thu, Aug 31, 2017 at 10:03:19AM +0100, Roger Pau Monne wrote:
>On Thu, Aug 31, 2017 at 03:32:42PM +0800, Chao Gao wrote:
>> On Tue, Aug 29, 2017 at 08:33:25AM +0100, Roger Pau Monne wrote:
>> >On Mon, Aug 28, 2017 at 06:18:13AM +, Tian, Kevin wrote:
>> >> > From: Roger Pau Monne [mailto:roger@citrix.com]
>> >> > Sent: Friday, August 25, 2017 9:59 PM
>> >> > 
>> >> > On Fri, Aug 25, 2017 at 06:25:36AM -0600, Jan Beulich wrote:
>> >> > > >>> On 25.08.17 at 14:15, <roger@citrix.com> wrote:
>> >> > > > On Wed, Aug 23, 2017 at 02:16:38AM -0600, Jan Beulich wrote:
>> >> > > >> >>> On 22.08.17 at 15:54, <roger@citrix.com> wrote:
>> >> > > >> > On Tue, Aug 22, 2017 at 06:26:23AM -0600, Jan Beulich wrote:
>> >> > > >> >> >>> On 11.08.17 at 18:43, <roger@citrix.com> wrote:
>> >> > > >> >> > --- a/xen/arch/x86/dom0_build.c
>> >> > > >> >> > +++ b/xen/arch/x86/dom0_build.c
>> >> > > >> >> > @@ -440,6 +440,10 @@ int __init
>> >> > dom0_setup_permissions(struct domain *d)
>> >> > > >> >> >  rc |= rangeset_add_singleton(mmio_ro_ranges, 
>> >> > > >> >> > mfn);
>> >> > > >> >> >  }
>> >> > > >> >> >
>> >> > > >> >> > +/* For PVH prevent access to the MMCFG areas. */
>> >> > > >> >> > +if ( dom0_pvh )
>> >> > > >> >> > +rc |= pci_mmcfg_set_domain_permissions(d);
>> >> > > >> >>
>> >> > > >> >> What about ones reported by Dom0 later on? Which then raises the
>> >> > > >> >> question whether ...
>> >> > > >> >
>> >> > > >> > This should be dealt with in the PHYSDEVOP_pci_mmcfg_reserved
>> >> > handler.
>> >> > > >> > But since you propose to do white listing, I guess it doesn't 
>> >> > > >> > matter
>> >> > > >> > that much anymore.
>> >> > > >>
>> >> > > >> Well, a fundamental question is whether white listing would work in
>> >> > > >> the first place. I could see room for severe problems e.g. with 
>> >> > > >> ACPI
>> >> > > >> methods wanting to access MMIO that's not described by any PCI
>> >> > > >> devices' BARs. Typically that would be regions in the chipset which
>> >> > > >> firmware is responsible for configuring/managing, the addresses of
>> >> > > >> which can be found/set in custom config space registers.
>> >> > > >
>> >> > > > The question would also be what would Xen allow in such 
>> >> > > > white-listing.
>> >> > > > Obviously you can get to map the same using both white-list and
>> >> > > > black-listing (see below).
>> >> > >
>> >> > > Not really - what you've said there regarding MMCFG regions is
>> >> > > a clear indication that we should _not_ map reserved regions, i.e.
>> >> > > it would need to be full white listing with perhaps just the PCI
>> >> > > device BARs being handled automatically.
>> >> > 
>> >> > I've tried just mapping the BARs and that sadly doesn't work, the box
>> >> > hangs after the IOMMU is enabled:
>> >> > 
>> >> > [...]
>> >> > (XEN) [VT-D]d0:PCI: map :3f:13.5
>> >> > (XEN) [VT-D]d0:PCI: map :3f:13.6
>> >> > (XEN) [VT-D]iommu_enable_translation: iommu->reg = 82c00021b000
>> >> > 
>> >> > I will park this ATM and leave it for the Intel guys to diagnose.
>> >> > 
>> >> > For the reference, the specific box I'm testing ATM has a Xeon(R) CPU
>> >> > E5-1607 0 @ 3.00GHz and a C600/X79 chipset.
>> >> > 
>> >> 
>> >> +Chao who can help check whether we have such a box at hand.
>> >> 
>> >> btw please also give your BIOS version.
>> >
>> >It's a Precision T3600 BIOS A14.
>> 
>> Hi, Roger.
>> 
>> I found a Ivy bridge box with E5-2697 v2 and tested with "dom0=pvh", and
>
>The ones I've seen issues with are Sandy Bridge or Nehalem, can you
>find some of this hardware?

As I expected, I was removed from the recipients :(, which made it
hard for me to notice your replies in time.

Yes, I will. But it may take some time (even Ivy Bridge is rare here).

>
>I haven't tested Ivy Bridge, but all Haswell boxes I've tested seem to
>work just fine.

Part of the reason I chose Ivy Bridge is that you said you found this bug
on almost all pre-Haswell boxes.

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-08-31 Thread Chao Gao
On Thu, Aug 31, 2017 at 02:33:57AM -0600, Jan Beulich wrote:
 On 31.08.17 at 09:15,  wrote:
>> On Thu, Aug 31, 2017 at 01:42:53AM -0600, Jan Beulich wrote:
>> On 31.08.17 at 00:57,  wrote:
 On Wed, Aug 30, 2017 at 10:00:49AM -0600, Jan Beulich wrote:
 On 16.08.17 at 07:14,  wrote:
>> @@ -100,6 +101,24 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>>  spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>>  }
>>  
>> +static void vmx_pi_add_vcpu(struct pi_blocking_vcpu *pbv,
>> +struct vmx_pi_blocking_vcpu *vpbv)
>> +{
>> +ASSERT(spin_is_locked(&vpbv->lock));
>
>You realize this is only a very weak check for a non-recursive lock?
 
 I just thought the lock should be held when adding one entry to the
 blocking list. Do you think we should remove this check or make it
 stricter?
>>>
>>>Well, the primary purpose of my comment was to make you aware
>>>of the fact. If the weak check is good enough for you, then fine.
>> 
>> To be honest, I don't know the difference between weak check and tight
>> check.
>
>For non-recursive locks spin_is_locked() only tells you if _any_
>CPU in the system currently holds the lock. For recursive ones it
>checks whether it's the local CPU that owns the lock.

This remark is very helpful to me.

>
>>>Removing the check would be a bad idea imo (but see also below);
>>>tightening might be worthwhile, but might also go too far (depending
>>>mainly on how clearly provable it is that all callers actually hold the
>>>lock).
>> 
>> IMO, the lock was introduced (not by me) to protect the blocking list.
>> list_add() and list_del() should be performed with the lock held. So I
>> think it is clear that all callers should hold the lock.
>
>Good.
>
>> +add_sized(&vpbv->counter, 1);
>> +ASSERT(read_atomic(&vpbv->counter));
>
>Why add_sized() and read_atomic() when you hold the lock?
>
 
 In patch 3, frequent reading the counter is used to find a suitable
 vcpu and we can use add_sized() and read_atomic() to avoid acquiring the
 lock. In one word, the lock doesn't protect the counter.
>>>
>>>In that case it would be more natural to switch to the atomic
>>>accesses there. Plus you still wouldn't need read_atomic()
>>>here, with the lock held. Furthermore I would then wonder
>>>whether it wasn't better to use atomic_t for the counter at
>> 
>> Is there some basic guide on when it is better to use read_atomic()
>> and add_sized() and when it is better to define a atomic variable
>> directly?
>
>If an atomic_t variable fits your needs, I think it should always
>be preferred. add_sized() was introduced for a case where an
>atomic_t variable would not have been usable. Please also
>consult older commits for understanding the background.

Ok, I will. Thanks for your guidance.
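For my own understanding, if the counter becomes an atomic_t the two
paths would look roughly like this (sketch; it assumes the field type is
changed accordingly):

    /* writer, still under the per-cpu lock protecting the list */
    spin_lock(&vpbv->lock);
    list_add(&pbv->list, &vpbv->list);
    atomic_inc(&vpbv->counter);
    spin_unlock(&vpbv->lock);

    /* lock-less reader in patch 3 */
    unsigned int nr = atomic_read(&per_cpu(vmx_pi_blocking, cpu).counter);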

>
>>>that point. Also with a lock-less readers the requirement to
>>>hold a lock here (rather than using suitable LOCKed accesses)
>>>becomes questionable too.
>> 
>> As I said above, I think the lock is used to protect the list.
>> 
>> I think this patch has two parts:
>> 1. Move all list operations to two inline functions. (with this, adding
>> a counter is easier and don't need add code in several places.)
>> 
>> 2. Add a counter.
>
>With it being left unclear whether the counter is meant to
>also be protected by the lock: In the patch here you claim it
>is, yet by later introducing lock-less readers you weaken
>that model. Hence the request to bring things into a
>consistent state right away, and ideally also into the final
>state.

Sure. I will clarify this and make things consistent.

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v2 1/4] x86/dom0: prevent access to MMCFG areas for PVH Dom0

2017-08-31 Thread Chao Gao
On Tue, Aug 29, 2017 at 08:33:25AM +0100, Roger Pau Monne wrote:
>On Mon, Aug 28, 2017 at 06:18:13AM +, Tian, Kevin wrote:
>> > From: Roger Pau Monne [mailto:roger@citrix.com]
>> > Sent: Friday, August 25, 2017 9:59 PM
>> > 
>> > On Fri, Aug 25, 2017 at 06:25:36AM -0600, Jan Beulich wrote:
>> > > >>> On 25.08.17 at 14:15,  wrote:
>> > > > On Wed, Aug 23, 2017 at 02:16:38AM -0600, Jan Beulich wrote:
>> > > >> >>> On 22.08.17 at 15:54,  wrote:
>> > > >> > On Tue, Aug 22, 2017 at 06:26:23AM -0600, Jan Beulich wrote:
>> > > >> >> >>> On 11.08.17 at 18:43,  wrote:
>> > > >> >> > --- a/xen/arch/x86/dom0_build.c
>> > > >> >> > +++ b/xen/arch/x86/dom0_build.c
>> > > >> >> > @@ -440,6 +440,10 @@ int __init
>> > dom0_setup_permissions(struct domain *d)
>> > > >> >> >  rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
>> > > >> >> >  }
>> > > >> >> >
>> > > >> >> > +/* For PVH prevent access to the MMCFG areas. */
>> > > >> >> > +if ( dom0_pvh )
>> > > >> >> > +rc |= pci_mmcfg_set_domain_permissions(d);
>> > > >> >>
>> > > >> >> What about ones reported by Dom0 later on? Which then raises the
>> > > >> >> question whether ...
>> > > >> >
>> > > >> > This should be dealt with in the PHYSDEVOP_pci_mmcfg_reserved
>> > handler.
>> > > >> > But since you propose to do white listing, I guess it doesn't matter
>> > > >> > that much anymore.
>> > > >>
>> > > >> Well, a fundamental question is whether white listing would work in
>> > > >> the first place. I could see room for severe problems e.g. with ACPI
>> > > >> methods wanting to access MMIO that's not described by any PCI
>> > > >> devices' BARs. Typically that would be regions in the chipset which
>> > > >> firmware is responsible for configuring/managing, the addresses of
>> > > >> which can be found/set in custom config space registers.
>> > > >
>> > > > The question would also be what would Xen allow in such white-listing.
>> > > > Obviously you can get to map the same using both white-list and
>> > > > black-listing (see below).
>> > >
>> > > Not really - what you've said there regarding MMCFG regions is
>> > > a clear indication that we should _not_ map reserved regions, i.e.
>> > > it would need to be full white listing with perhaps just the PCI
>> > > device BARs being handled automatically.
>> > 
>> > I've tried just mapping the BARs and that sadly doesn't work, the box
>> > hangs after the IOMMU is enabled:
>> > 
>> > [...]
>> > (XEN) [VT-D]d0:PCI: map :3f:13.5
>> > (XEN) [VT-D]d0:PCI: map :3f:13.6
>> > (XEN) [VT-D]iommu_enable_translation: iommu->reg = 82c00021b000
>> > 
>> > I will park this ATM and leave it for the Intel guys to diagnose.
>> > 
>> > For the reference, the specific box I'm testing ATM has a Xeon(R) CPU
>> > E5-1607 0 @ 3.00GHz and a C600/X79 chipset.
>> > 
>> 
>> +Chao who can help check whether we have such a box at hand.
>> 
>> btw please also give your BIOS version.
>
>It's a Precision T3600 BIOS A14.

Hi, Roger.

I found an Ivy Bridge box with an E5-2697 v2 and tested with "dom0=pvh", and
the bug didn't occur on this box. The log is below:
XEN) [7.509588] [VT-D]d0:PCIe: map :ff:1e.2
(XEN) [7.511047] [VT-D]d0:PCIe: map :ff:1e.3
(XEN) [7.512463] [VT-D]d0:PCIe: map :ff:1e.4
(XEN) [7.513927] [VT-D]d0:PCIe: map :ff:1e.5
(XEN) [7.515342] [VT-D]d0:PCIe: map :ff:1e.6
(XEN) [7.516808] [VT-D]d0:PCIe: map :ff:1e.7
(XEN) [7.519449] [VT-D]iommu_enable_translation: iommu->reg =
82c00021b000
(XEN) [7.522295] [VT-D]iommu_enable_translation: iommu->reg =
82c00021d000
(XEN) [8.675096] OS: linux version: 2.6 loader: generic bitness:
64-bit
(XEN) [8.726763] 
(XEN) [8.730171] 
(XEN) [8.737491] Panic on CPU 0:
(XEN) [8.742376] Building a PVHv2 Dom0 is not yet supported.
(XEN) [8.750148] 
(XEN) [8.757457] 
(XEN) [8.760877] Reboot in five seconds...
(XEN) [   13.769050] Resetting with ACPI MEMORY or I/O RESET_REG

I agree with you that there may be some bugs in the firmware and VT-d.
I did trials on a Haswell box with iommu_inclusive_mapping=false. I did
see DMA translation faults: the address to be translated is reserved in
the e820 but isn't included in any RMRR. Even so, the box booted up
successfully.

But if the bug exists with a PVH dom0, it should also exist with a PV
dom0. Could you try to boot a PV dom0 with iommu_inclusive_mapping=false?
Theoretically, the system would hang exactly like what you saw with the
PVH dom0. Is that right? Or am I missing some difference between PVH
dom0 and PV dom0?
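For reference, what I mean is removing dom0=pvh and adding
iommu_inclusive_mapping=false on the Xen command line, e.g. something
like this in grub (the other options and the path are just an example):

multiboot /boot/xen.gz console=com1 iommu=verbose iommu_inclusive_mapping=false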

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-08-31 Thread Chao Gao
On Thu, Aug 31, 2017 at 01:42:53AM -0600, Jan Beulich wrote:
 On 31.08.17 at 00:57,  wrote:
>> On Wed, Aug 30, 2017 at 10:00:49AM -0600, Jan Beulich wrote:
>> On 16.08.17 at 07:14,  wrote:
 @@ -100,6 +101,24 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
  spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
  }
  
 +static void vmx_pi_add_vcpu(struct pi_blocking_vcpu *pbv,
 +struct vmx_pi_blocking_vcpu *vpbv)
 +{
 +ASSERT(spin_is_locked(&vpbv->lock));
>>>
>>>You realize this is only a very weak check for a non-recursive lock?
>> 
>> I just thought the lock should be held when adding one entry to the
>> blocking list. Do you think we should remove this check or make it
>> stricter?
>
>Well, the primary purpose of my comment was to make you aware
>of the fact. If the weak check is good enough for you, then fine.

To be honest, I don't know the difference between weak check and tight
check.

>Removing the check would be a bad idea imo (but see also below);
>tightening might be worthwhile, but might also go too far (depending
>mainly on how clearly provable it is that all callers actually hold the
>lock).

IMO, the lock was introduced (not by me) to protect the blocking list.
list_add() and list_del() should be performed with the lock held. So I
think it is clear that all callers should hold the lock.

>
 +add_sized(&vpbv->counter, 1);
 +ASSERT(read_atomic(&vpbv->counter));
>>>
>>>Why add_sized() and read_atomic() when you hold the lock?
>>>
>> 
>> In patch 3, frequent reading the counter is used to find a suitable
>> vcpu and we can use add_sized() and read_atomic() to avoid acquiring the
>> lock. In one word, the lock doesn't protect the counter.
>
>In that case it would be more natural to switch to the atomic
>accesses there. Plus you still wouldn't need read_atomic()
>here, with the lock held. Furthermore I would then wonder
>whether it wasn't better to use atomic_t for the counter at

Is there some basic guide on when it is better to use read_atomic()
and add_sized() and when it is better to define an atomic_t variable
directly?

>that point. Also with a lock-less readers the requirement to
>hold a lock here (rather than using suitable LOCKed accesses)
>becomes questionable too.

As I said above, I think the lock is used to protect the list.

I think this patch has two parts:
1. Move all list operations into two inline functions. (With this, adding
a counter is easier and we don't need to add code in several places.)

2. Add a counter.

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-08-30 Thread Chao Gao
On Wed, Aug 30, 2017 at 10:00:49AM -0600, Jan Beulich wrote:
 On 16.08.17 at 07:14,  wrote:
>> @@ -100,6 +101,24 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
>>  spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
>>  }
>>  
>> +static void vmx_pi_add_vcpu(struct pi_blocking_vcpu *pbv,
>> +struct vmx_pi_blocking_vcpu *vpbv)
>> +{
>> +ASSERT(spin_is_locked(&vpbv->lock));
>
>You realize this is only a very weak check for a non-recursive lock?

I just thought the lock should be held when adding one entry to the
blocking list. Do you think we should remove this check or make it
stricter?

>
>> +add_sized(&vpbv->counter, 1);
>> +ASSERT(read_atomic(&vpbv->counter));
>
>Why add_sized() and read_atomic() when you hold the lock?
>

In patch 3, the counter is read frequently to find a suitable vcpu, and
add_sized() / read_atomic() let readers avoid acquiring the lock. In
short, the lock doesn't protect the counter.
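To illustrate with the names used in this patch (sketch):

    /* writer, under the per-cpu lock that protects the list */
    spin_lock(&vpbv->lock);
    list_add(&pbv->list, &vpbv->list);
    add_sized(&vpbv->counter, 1);
    spin_unlock(&vpbv->lock);

    /* lock-less reader in patch 3, possibly on another CPU */
    unsigned int nr = read_atomic(&per_cpu(vmx_pi_blocking, cpu).counter);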

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v10] VT-d: use correct BDF for VF to search VT-d unit

2017-08-28 Thread Chao Gao
On Mon, Aug 28, 2017 at 02:16:18AM -0600, Jan Beulich wrote:
>>>> On 28.08.17 at 04:42, <chao@intel.com> wrote:
>> When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function'
>> are under the scope of the same VT-d unit as the 'Physical Function'.
>> A 'Physical Function' can be a 'Traditional Function' or an ARI
>> 'Extended Function'. And furthermore, 'Extended Functions' on an
>> endpoint are under the scope of the same VT-d unit as the 'Traditional
>> Functions' on the endpoint. To search VT-d unit for a VF, if its PF
>> isn't an extended function, the BDF of PF should be used. Otherwise
>> the BDF of a traditional function in the same device with the PF
>> should be used.
>> 
>> Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'.
>> But it is conceptually wrong w/o checking whether the PF is an extended
>> function and would lead to matching VFs of an RC-integrated PF to a wrong
>> VT-d unit.
>> 
>> This patch overrides VF 'is_extfn' field and uses this field to
>> indicate whether the PF of this VF is an extended function. The field
>> helps to use correct BDF to search VT-d unit.
>> 
>> Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>> Signed-off-by: Chao Gao <chao@intel.com>
>
>Acked-by: Jan Beulich <jbeul...@suse.com>
>albeit ...
>
>> --- a/xen/drivers/passthrough/vtd/dmar.c
>> +++ b/xen/drivers/passthrough/vtd/dmar.c
>> @@ -211,15 +211,15 @@ struct acpi_drhd_unit 
>> *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
>>  if ( pdev == NULL )
>>  return NULL;
>>  
>> -if ( pdev->info.is_extfn )
>> +if ( pdev->info.is_virtfn )
>>  {
>> -bus = pdev->bus;
>> -devfn = 0;
>> +bus = pdev->info.physfn.bus;
>> +devfn = (!pdev->info.is_extfn) ? pdev->info.physfn.devfn : 0;
>
>... if I end up committing this and if I don't forget, I'll likely take the
>liberty to remove the pointless parentheses here.
>

Hi, Eric.

Could you test this patch again and give your Tested-by if it fixes the
problem you reported?

Thanks
Chao

___
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


[Xen-devel] [PATCH v10] VT-d: use correct BDF for VF to search VT-d unit

2017-08-27 Thread Chao Gao
When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function'
are under the scope of the same VT-d unit as the 'Physical Function'.
A 'Physical Function' can be a 'Traditional Function' or an ARI
'Extended Function'. And furthermore, 'Extended Functions' on an
endpoint are under the scope of the same VT-d unit as the 'Traditional
Functions' on the endpoint. To search VT-d unit for a VF, if its PF
isn't an extended function, the BDF of PF should be used. Otherwise
the BDF of a traditional function in the same device with the PF
should be used.

Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'.
But it is conceptually wrong w/o checking whether the PF is an extended
function and would lead to matching VFs of an RC-integrated PF to a wrong
VT-d unit.

This patch overrides VF 'is_extfn' field and uses this field to
indicate whether the PF of this VF is an extended function. The field
helps to use correct BDF to search VT-d unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
v10:
 - move setting vf's is_extfn closer to the place where we set other fields.
 - reverse the conditional expression in acpi_find_matched_drhd_unit()

v9:
 - check 'is_virtfn' first in pci_add_device() to avoid potential error if
 linux side sets VF's 'is_extfn'
 - comments changes to make it clear that we use VF's 'is_extfn' intentionally
 otherwise the patch seems like a workaround.

v8:
 - use "conceptually wrong", instead of "a corner case" in commit message
 - check 'is_virtfn' first in acpi_find_matched_drhd_unit()

v7:
 - Drop Eric's tested-by
 - Change commit message to be clearer
 - Re-use VF's is_extfn field
 - access PF's is_extfn field in locked area

---
 xen/drivers/passthrough/pci.c  | 19 +++
 xen/drivers/passthrough/vtd/dmar.c | 12 ++--
 xen/include/xen/pci.h  |  4 
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..187a9e7 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,21 +599,24 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
-if (!info)
+if ( !info )
 pdev_type = "device";
-else if (info->is_extfn)
-pdev_type = "extended function";
-else if (info->is_virtfn)
+else if ( info->is_virtfn )
 {
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
 pcidevs_unlock();
 if ( !pdev )
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
NULL, node);
 pdev_type = "virtual function";
 }
+else if ( info->is_extfn )
+pdev_type = "extended function";
 else
 {
 info = NULL;
@@ -637,7 +640,15 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 pdev->node = node;
 
 if ( info )
+{
 pdev->info = *info;
+/*
+ * VF's 'is_extfn' field is used to indicate whether its PF is an
+ * extended function.
+ */
+if ( pdev->info.is_virtfn )
+pdev->info.is_extfn = pf_is_extfn;
+}
 else if ( !pdev->vf_rlen[0] )
 {
 unsigned int pos = pci_find_ext_capability(seg, bus, devfn,
diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..9676471 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -211,15 +211,15 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const 
struct pci_dev *pdev)
 if ( pdev == NULL )
 return NULL;
 
-if ( pdev->info.is_extfn )
+if ( pdev->info.is_virtfn )
 {
-bus = pdev->bus;
-devfn = 0;
+bus = pdev->info.physfn.bus;
+devfn = (!pdev->info.is_extfn) ? pdev->info.physfn.devfn : 0;
 }
-else if ( pdev->info.is_virtfn )
+else if ( pdev->info.is_extfn )
 {
-bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : 
pdev->info.physfn.devfn;
+bus = pdev->bus;
+devfn = 0;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..da1bd22 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -39,6 +39,10 @@
 #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
 
 struct pci_dev_info {
+/*
+ * VF's 'is_extfn' field is used to indicate whether its PF is an extended
+ * function.
+ */
 bool_t is_extfn;
 bool_t is_virtfn;
 struct {
-- 
1.8.3.1




Re: [Xen-devel] [PATCH RESEND v9] VT-d: use correct BDF for VF to search VT-d unit

2017-08-25 Thread Chao Gao
On Fri, Aug 25, 2017 at 09:20:23AM -0600, Jan Beulich wrote:
>>>> On 25.08.17 at 15:51, <chao@intel.com> wrote:
>> On Fri, Aug 25, 2017 at 03:39:38AM -0600, Jan Beulich wrote:
>>>>>> On 25.08.17 at 07:27, <chao@intel.com> wrote:
>>>> When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are 
>>>> under
>>>> the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
>>>> Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
>>>> And furthermore, 'Extended Functions' on an endpoint are under the scope of
>>>> the same VT-d unit as the 'Traditional Functions' on the endpoint. To 
>>>> search
>>>> VT-d unit, the BDF of PF or the BDF of a traditional function may be used. 
>>>> And
>>>> it depends on whether the PF is an extended function or not.
>>>> 
>>>> Current code uses PCI_SLOT() to recognize an ARI 'Extended Funcion'. But it
>>>> is conceptually wrong w/o checking whether PF is an extended function and
>>>> would lead to match VFs of a RC endpoint to a wrong VT-d unit.
>>>> 
>>>> This patch uses VF's 'is_extfn' field to indicate whether the PF of this 
>>>> VF 
>>>> is
>>>> an extended function. The field helps to use correct BDF to search VT-d 
>>>> unit.
>>>> 
>>>> Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>>>> Signed-off-by: Chao Gao <chao@intel.com>
>>>> ---
>>>>  - RESEND for the previous email has no subject. 
>>>> 
>>>> v9:
>>>>  - check 'is_virtfn' first in pci_add_device() to avoid potential error if
>>>>  linux side sets VF's 'is_extfn'
>>>>  - comments changes to make it clear that we use VF's 'is_extfn' 
>>>> intentionally
>>>>  otherwise the patch seems like a workaround.
>>>> 
>>>> v8:
>>>>  - use "conceptually wrong", instead of "a corner case" in commit message
>>>>  - check 'is_virtfn' first in acpi_find_matched_drhd_unit()
>>>> 
>>>> v7:
>>>>  - Drop Eric's tested-by
>>>>  - Change commit message to be clearer
>>>>  - Re-use VF's is_extfn field
>>>>  - access PF's is_extfn field in locked area
>>>> 
>>>> ---
>>>>  xen/drivers/passthrough/pci.c  | 14 ++
>>>>  xen/drivers/passthrough/vtd/dmar.c | 12 ++--
>>>>  xen/include/xen/pci.h  |  1 +
>>>>  3 files changed, 17 insertions(+), 10 deletions(-)
>>>> 
>>>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>>>> index 27bdb71..0e27e29 100644
>>>> --- a/xen/drivers/passthrough/pci.c
>>>> +++ b/xen/drivers/passthrough/pci.c
>>>> @@ -599,21 +599,24 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>>>  unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>>>>  const char *pdev_type;
>>>>  int ret;
>>>> +bool pf_is_extfn = false;
>>>>  
>>>> -if (!info)
>>>> +if ( !info )
>>>>  pdev_type = "device";
>>>> -else if (info->is_extfn)
>>>> -pdev_type = "extended function";
>>>> -else if (info->is_virtfn)
>>>> +else if ( info->is_virtfn )
>>>>  {
>>>>  pcidevs_lock();
>>>>  pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
>>>> +if ( pdev )
>>>> +pf_is_extfn = pdev->info.is_extfn;
>>>>  pcidevs_unlock();
>>>>  if ( !pdev )
>>>>  pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>>>> NULL, node);
>>>>  pdev_type = "virtual function";
>>>>  }
>>>> +else if ( info->is_extfn )
>>>> +pdev_type = "extended function";
>>>>  else
>>>>  {
>>>>  info = NULL;
>>>> @@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>>> seg, bus, slot, func, ctrl);
>>>>  }
>>>>  
>>>> +/* VF's 'is_extfn' is used to indicate whether PF is an extended 
>> function */
>>>> +if ( pdev->info.is_virt

Re: [Xen-devel] [PATCH RESEND v9] VT-d: use correct BDF for VF to search VT-d unit

2017-08-25 Thread Chao Gao
On Fri, Aug 25, 2017 at 03:39:38AM -0600, Jan Beulich wrote:
>>>> On 25.08.17 at 07:27, <chao@intel.com> wrote:
>> When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are 
>> under
>> the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
>> Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
>> And furthermore, 'Extended Functions' on an endpoint are under the scope of
>> the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
>> VT-d unit, the BDF of PF or the BDF of a traditional function may be used. 
>> And
>> it depends on whether the PF is an extended function or not.
>> 
>> Current code uses PCI_SLOT() to recognize an ARI 'Extended Funcion'. But it
>> is conceptually wrong w/o checking whether PF is an extended function and
>> would lead to match VFs of a RC endpoint to a wrong VT-d unit.
>> 
>> This patch uses VF's 'is_extfn' field to indicate whether the PF of this VF 
>> is
>> an extended function. The field helps to use correct BDF to search VT-d unit.
>> 
>> Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>> Signed-off-by: Chao Gao <chao@intel.com>
>> ---
>>  - RESEND for the previous email has no subject. 
>> 
>> v9:
>>  - check 'is_virtfn' first in pci_add_device() to avoid potential error if
>>  linux side sets VF's 'is_extfn'
>>  - comments changes to make it clear that we use VF's 'is_extfn' 
>> intentionally
>>  otherwise the patch seems like a workaround.
>> 
>> v8:
>>  - use "conceptually wrong", instead of "a corner case" in commit message
>>  - check 'is_virtfn' first in acpi_find_matched_drhd_unit()
>> 
>> v7:
>>  - Drop Eric's tested-by
>>  - Change commit message to be clearer
>>  - Re-use VF's is_extfn field
>>  - access PF's is_extfn field in locked area
>> 
>> ---
>>  xen/drivers/passthrough/pci.c  | 14 ++
>>  xen/drivers/passthrough/vtd/dmar.c | 12 ++--
>>  xen/include/xen/pci.h  |  1 +
>>  3 files changed, 17 insertions(+), 10 deletions(-)
>> 
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index 27bdb71..0e27e29 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -599,21 +599,24 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>  unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>>  const char *pdev_type;
>>  int ret;
>> +bool pf_is_extfn = false;
>>  
>> -if (!info)
>> +if ( !info )
>>  pdev_type = "device";
>> -else if (info->is_extfn)
>> -pdev_type = "extended function";
>> -else if (info->is_virtfn)
>> +else if ( info->is_virtfn )
>>  {
>>  pcidevs_lock();
>>  pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
>> +if ( pdev )
>> +pf_is_extfn = pdev->info.is_extfn;
>>  pcidevs_unlock();
>>  if ( !pdev )
>>  pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>> NULL, node);
>>  pdev_type = "virtual function";
>>  }
>> +else if ( info->is_extfn )
>> +pdev_type = "extended function";
>>  else
>>  {
>>  info = NULL;
>> @@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>> seg, bus, slot, func, ctrl);
>>  }
>>  
>> +/* VF's 'is_extfn' is used to indicate whether PF is an extended 
>> function */
>> +if ( pdev->info.is_virtfn )
>> +pdev->info.is_extfn = pf_is_extfn;
>>  check_pdev(pdev);
>
>Can this please be moved up right next to
>
>pdev->info = *info;
>
>, so that information is right from the point it is being stored? And

Yes. I will.

>looking at that code I can't really work out why the SR-IOV device
>handling is in an "else if" to that path. I can't check that case
>myself, as by box'es root ports don't support ARI forwarding, so
>despite PF and VF being ARI-capable it can't be enabled, and
>hence I'm not seeing the devices reported as extended functions.

Yeah. I think we should remove the "else if", since this is the only place
where vf_rlen[] is set; otherwise an extended PF's vf_rlen[] won't be
initialized. I think we don't have an extended PF at present, so the bug
isn't triggered. Currently, a VF won't implement the SR-IOV capability
(see SR-IOV spec v1.1, chapter 3.7 "PCI Express Extended Capabilities").
Even if VFs implement SR-IOV later, I think as long as a function is
SR-IOV capable, we can initialize vf_rlen[] here.
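
Something along these lines, for illustration only (a sketch against the
code as of this series, not a tested change; the capability-ID macro name
is as I recall it from pci.c):

    if ( info )
    {
        pdev->info = *info;
        if ( pdev->info.is_virtfn )
            pdev->info.is_extfn = pf_is_extfn;
    }

    /* No longer an "else if": initialize vf_rlen[] for any SR-IOV-capable
     * function being added, including an extended PF passed in via 'info'. */
    if ( !pdev->info.is_virtfn && !pdev->vf_rlen[0] )
    {
        unsigned int pos = pci_find_ext_capability(seg, bus, devfn,
                                                   PCI_EXT_CAP_ID_SRIOV);

        /* ... read the VF BAR sizes into pdev->vf_rlen[] as today ... */
    }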

Do you think it is a bug? If yes, should it be fixed in this patch?

Thanks
Chao



Re: [Xen-devel] [PATCH V2 25/25] x86/vvtd: save and restore emulated VT-d

2017-08-25 Thread Chao Gao
On Fri, Aug 25, 2017 at 03:00:32AM -0600, Jan Beulich wrote:
>>>> On 25.08.17 at 08:35, <chao@intel.com> wrote:
>> On Wed, Aug 23, 2017 at 01:19:41PM +0100, Roger Pau Monné wrote:
>>>On Wed, Aug 09, 2017 at 04:34:26PM -0400, Lan Tianyu wrote:
>>>> From: Chao Gao <chao@intel.com>
>>>> 
>>>> Wrap some useful status in a new structure hvm_hw_vvtd, following
>>>> the customs of vlapic, vioapic and etc. Provide two save-restore
>>>> pairs to save/restore registers and non-register status.
>>>> 
>>>> Signed-off-by: Chao Gao <chao@intel.com>
>>>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>>>> ---
>>>> diff --git a/xen/include/public/arch-x86/hvm/save.h 
>> b/xen/include/public/arch-x86/hvm/save.h
>>>> index fd7bf3f..10536cb 100644
>>>> --- a/xen/include/public/arch-x86/hvm/save.h
>>>> +++ b/xen/include/public/arch-x86/hvm/save.h
>>>> @@ -639,10 +639,32 @@ struct hvm_msr {
>>>>  
>>>>  #define CPU_MSR_CODE  20
>>>>  
>>>> +struct hvm_hw_vvtd_regs {
>>>> +uint8_t data[1024];
>>>> +};
>>>> +
>>>> +DECLARE_HVM_SAVE_TYPE(IOMMU_REGS, 21, struct hvm_hw_vvtd_regs);
>>>> +
>>>> +struct hvm_hw_vvtd
>>>> +{
>>>> +/* VIOMMU_STATUS_XXX */
>>>> +uint32_t status;
>>>> +/* Fault Recording index */
>>>> +uint32_t frcd_idx;
>>>> +/* Is in Extended Interrupt Mode? */
>>>> +uint32_t eim;
>>>> +/* Max remapping entries in IRT */
>>>> +uint32_t irt_max_entry;
>>>> +/* Interrupt remapping table base gfn */
>>>> +uint64_t irt;
>>>> +};
>>>> +
>>>> +DECLARE_HVM_SAVE_TYPE(IOMMU, 22, struct hvm_hw_vvtd);
>>>
>>>Why two separate structures? It should be the same structure.
>> 
>> Hi, Roger.
>> 
>> Thank you for your review. I agree with most of your comments on the
>> whole series. I will only reply to some points I think still need
>> discussion.
>> 
>> Here we use two separate structures for some field cannot be infered
>> from the struct hvm_hw_vvtd_regs. For example, the 'irt' is the gfn of
>> the base address Interrupt Remapping Table. The field is set through
>> 1. set the register DMAR_IRTE_REG in hvm_hw_vvtd_regs.
>> 2. send a command to vtd by writting another command register.
>> 
>> If the current base address is A, and guest wants to update the base
>> address to B and finish the first step. Unfortunately, saving and
>> restoring happen here. In this case, we need the struct hvm_hw_vvtd
>> to correctly restore some information.
>
>Hmm, the way I've understood Roger's question is why you
>don't combine the two structures into one, not whether one
>of the two can be omitted.

It seems likely that they can be combined. I will give it a try.

Thanks
Chao



Re: [Xen-devel] [PATCH V2 21/25] tools/libxc: Add a new interface to bind remapping format msi with pirq

2017-08-25 Thread Chao Gao
On Wed, Aug 23, 2017 at 11:41:25AM +0100, Roger Pau Monné wrote:
>On Wed, Aug 09, 2017 at 04:34:22PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Introduce a new binding relationship and provide a new interface to
>> manage the new relationship.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>>  pirq_dpci->gmsi.posted = false;
>>  vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
>> -if ( iommu_intpost )
>> +/* Currently, don't use interrupt posting for guest's remapping 
>> MSIs */
>> +if ( iommu_intpost && !ir )
>>  {
>>  if ( delivery_mode == dest_LowestPrio )
>>  vcpu = vector_hashing_dest(d, dest, dest_mode,
>> @@ -435,7 +527,7 @@ int pt_irq_create_bind(
>>  hvm_migrate_pirqs(d->vcpu[dest_vcpu_id]);
>>  
>>  /* Use interrupt posting if it is supported. */
>> -if ( iommu_intpost )
>> +if ( iommu_intpost && !ir )
>
>So with interrupt remapping posted interrupts are not available...

Yes. We want to keep things simple. Currently, no vIRTE is cached by
vvtd and thus we needn't do anything when the guest tries to flush a
vIRTE. If we used posted interrupts here, some information would be
cached by the physical VT-d. In that case, we would have to make the
effort to flush the corresponding physical IRTE. We don't include those
patches in this series.

Thanks
Chao




Re: [Xen-devel] [PATCH V2 23/25] x86/vvtd: Handle interrupt translation faults

2017-08-25 Thread Chao Gao
On Wed, Aug 23, 2017 at 12:51:27PM +0100, Roger Pau Monné wrote:
>On Wed, Aug 09, 2017 at 04:34:24PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Interrupt translation faults are non-recoverable fault. When faults
>   ^ faults
>> are triggered, it needs to populate fault info to Fault Recording
>> Registers and inject vIOMMU msi interrupt to notify guest IOMMU driver
>> to deal with faults.
>> 
>> This patch emulates hardware's handling interrupt translation
>> faults (more information about the process can be found in VT-d spec,
>> chipter "Translation Faults", section "Non-Recoverable Fault
>  ^ chapter
>> Reporting" and section "Non-Recoverable Logging").
>> Specifically, viommu_record_fault() records the fault information and
>> viommu_report_non_recoverable_fault() reports faults to software.
>> Currently, only Primary Fault Logging is supported and the Number of
>> Fault-recording Registers is 1.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>
>>  /* Address range of remapping hardware register-set */
>>  uint64_t base_addr;
>>  uint64_t length;
>> @@ -97,6 +101,23 @@ static inline struct vvtd *vcpu_vvtd(struct vcpu *v)
>>  return domain_vvtd(v->domain);
>>  }
>>  
>> +static inline int vvtd_test_and_set_bit(struct vvtd *vvtd, uint32_t reg,
>> +int nr)
>
>unsigned int for nr, and I'm not really sure the usefulness of this
>helpers. In any case inline should not be used and instead let the
>compiler optimize this.
>

I think the compiler doesn't know how frequently these functions are
called. Explicitly marking short, frequently used functions as inline can
sometimes help when the compiler wouldn't do it on its own.

>> +static void vvtd_report_non_recoverable_fault(struct vvtd *vvtd, int reason)
>> +{
>> +uint32_t fsts;
>> +
>> +ASSERT(reason & DMA_FSTS_FAULTS);
>> +fsts = vvtd_get_reg(vvtd, DMAR_FSTS_REG);
>> +__vvtd_set_bit(vvtd, DMAR_FSTS_REG, reason);
>
>I don't understand this, is reason a bit position or a mask?
>
>DMA_FSTS_FAULTS seems to be a mask, that should be set into DMAR_FSTS_REG?

According to VT-d spec section 10.4.9, each kind of fault is denoted by
one bit in DMAR_FSTS_REG.
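
For reference, a minimal sketch of that layout (bit positions as I read
the spec; the series' own macros are the authoritative ones):

    /* DMAR_FSTS_REG: one bit per non-recoverable fault condition. */
    #define DMA_FSTS_PFO  (1u << 0)   /* Primary Fault Overflow */
    #define DMA_FSTS_PPF  (1u << 1)   /* Primary Pending Fault */
    #define DMA_FSTS_IQE  (1u << 4)   /* Invalidation Queue Error */
    #define DMA_FSTS_ITE  (1u << 6)   /* Invalidation Time-out Error */

Reporting a fault is then just a matter of OR-ing the relevant bit(s)
into the register value.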

>>  static int vvtd_record_fault(struct vvtd *vvtd,
>> - struct irq_remapping_request *irq,
>> + struct irq_remapping_request *request,
>>   int reason)
>>  {
>> -return 0;
>> +struct vtd_fault_record_register frcd;
>> +int frcd_idx;
>> +
>> +switch(reason)
>> +{
>> +case VTD_FR_IR_REQ_RSVD:
>> +case VTD_FR_IR_INDEX_OVER:
>> +case VTD_FR_IR_ENTRY_P:
>> +case VTD_FR_IR_ROOT_INVAL:
>> +case VTD_FR_IR_IRTE_RSVD:
>> +case VTD_FR_IR_REQ_COMPAT:
>> +case VTD_FR_IR_SID_ERR:
>> +if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_PFO_BIT) )
>> +return X86EMUL_OKAY;
>> +
>> +/* No available Fault Record means Fault overflowed */
>> +frcd_idx = vvtd_alloc_frcd(vvtd);
>> +if ( frcd_idx == -1 )
>> +{
>> +vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_PFO_BIT);
>> +return X86EMUL_OKAY;
>> +}
>> +memset(&frcd, 0, sizeof(frcd));
>> +frcd.fields.FR = (u8)reason;
>> +frcd.fields.FI = ((u64)irq_remapping_request_index(request)) << 36;
>> +frcd.fields.SID = (u16)request->source_id;
>> +frcd.fields.F = 1;
>> +vvtd_commit_frcd(vvtd, frcd_idx, &frcd);
>> +return X86EMUL_OKAY;
>> +
>> +default:
>
>Other reasons are just ignored? Should this have an ASSERT_UNREACHABLE
>maybe?

It can, since all these faults are raised by vvtd itself. When vvtd
generates a new kind of fault, the corresponding handler should also be
added.

>
>> +break;
>> +}
>> +
>> +gdprintk(XENLOG_ERR, "Can't handle vVTD Fault (reason 0x%x).", reason);
>> +domain_crash(vvtd->domain);
>
>Oh, I see. Is it expected that such faults with unhandled reasons can
>be somehow generated by the domain itself?
>

No. Faults are generated by vvtd. We only add interrupt translation
faults. Other faults can be added when adding other features (e.g. DMA
remapping). 




Re: [Xen-devel] [PATCH V2 25/25] x86/vvtd: save and restore emulated VT-d

2017-08-25 Thread Chao Gao
On Wed, Aug 23, 2017 at 01:19:41PM +0100, Roger Pau Monné wrote:
>On Wed, Aug 09, 2017 at 04:34:26PM -0400, Lan Tianyu wrote:
>> From: Chao Gao <chao@intel.com>
>> 
>> Wrap some useful status in a new structure hvm_hw_vvtd, following
>> the customs of vlapic, vioapic and etc. Provide two save-restore
>> pairs to save/restore registers and non-register status.
>> 
>> Signed-off-by: Chao Gao <chao@intel.com>
>> Signed-off-by: Lan Tianyu <tianyu@intel.com>
>> ---
>> diff --git a/xen/include/public/arch-x86/hvm/save.h 
>> b/xen/include/public/arch-x86/hvm/save.h
>> index fd7bf3f..10536cb 100644
>> --- a/xen/include/public/arch-x86/hvm/save.h
>> +++ b/xen/include/public/arch-x86/hvm/save.h
>> @@ -639,10 +639,32 @@ struct hvm_msr {
>>  
>>  #define CPU_MSR_CODE  20
>>  
>> +struct hvm_hw_vvtd_regs {
>> +uint8_t data[1024];
>> +};
>> +
>> +DECLARE_HVM_SAVE_TYPE(IOMMU_REGS, 21, struct hvm_hw_vvtd_regs);
>> +
>> +struct hvm_hw_vvtd
>> +{
>> +/* VIOMMU_STATUS_XXX */
>> +uint32_t status;
>> +/* Fault Recording index */
>> +uint32_t frcd_idx;
>> +/* Is in Extended Interrupt Mode? */
>> +uint32_t eim;
>> +/* Max remapping entries in IRT */
>> +uint32_t irt_max_entry;
>> +/* Interrupt remapping table base gfn */
>> +uint64_t irt;
>> +};
>> +
>> +DECLARE_HVM_SAVE_TYPE(IOMMU, 22, struct hvm_hw_vvtd);
>
>Why two separate structures? It should be the same structure.

Hi, Roger.

Thank you for your review. I agree with most of your comments on the
whole series. I will only reply to some points I think still need
discussion.

Here we use two separate structures because some fields cannot be
inferred from struct hvm_hw_vvtd_regs. For example, 'irt' is the gfn of
the Interrupt Remapping Table base address. The field is set in two
steps:
1. set the register DMAR_IRTE_REG in hvm_hw_vvtd_regs;
2. send a command to the vtd by writing another command register.

If the current base address is A, the guest wants to update the base
address to B, and it has only finished the first step when saving and
restoring happen, then we need struct hvm_hw_vvtd to correctly restore
that information.
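
A minimal sketch of the point (names here are purely illustrative, not
the series' actual ones):

    /* The register file alone is ambiguous: the IRT address register may
     * already hold B while the base actually in use is still A, until the
     * 'set interrupt remap table pointer' command is issued. */
    struct vvtd_state_sketch {
        uint8_t  regs[1024];   /* raw registers, i.e. hvm_hw_vvtd_regs      */
        uint64_t irt_gfn;      /* base currently in use, latched on command */
    };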

Thanks
Chao



[Xen-devel] [PATCH RESEND v9] VT-d: use correct BDF for VF to search VT-d unit

2017-08-25 Thread Chao Gao
When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are under
the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
And furthermore, 'Extended Functions' on an endpoint are under the scope of
the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
VT-d unit, the BDF of PF or the BDF of a traditional function may be used. And
it depends on whether the PF is an extended function or not.

Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'. But it
is conceptually wrong without checking whether the PF is an extended function
and would lead to matching VFs of an RC endpoint to a wrong VT-d unit.

This patch uses VF's 'is_extfn' field to indicate whether the PF of this VF is
an extended function. The field helps to use correct BDF to search VT-d unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 - RESEND for the previous email has no subject. 

v9:
 - check 'is_virtfn' first in pci_add_device() to avoid potential error if
 linux side sets VF's 'is_extfn'
 - comments changes to make it clear that we use VF's 'is_extfn' intentionally
 otherwise the patch seems like a workaround.

v8:
 - use "conceptually wrong", instead of "a corner case" in commit message
 - check 'is_virtfn' first in acpi_find_matched_drhd_unit()

v7:
 - Drop Eric's tested-by
 - Change commit message to be clearer
 - Re-use VF's is_extfn field
 - access PF's is_extfn field in locked area

---
 xen/drivers/passthrough/pci.c  | 14 ++
 xen/drivers/passthrough/vtd/dmar.c | 12 ++--
 xen/include/xen/pci.h  |  1 +
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..0e27e29 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,21 +599,24 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
-if (!info)
+if ( !info )
 pdev_type = "device";
-else if (info->is_extfn)
-pdev_type = "extended function";
-else if (info->is_virtfn)
+else if ( info->is_virtfn )
 {
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
 pcidevs_unlock();
 if ( !pdev )
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
NULL, node);
 pdev_type = "virtual function";
 }
+else if ( info->is_extfn )
+pdev_type = "extended function";
 else
 {
 info = NULL;
@@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, ctrl);
 }
 
+/* VF's 'is_extfn' is used to indicate whether PF is an extended function 
*/
+if ( pdev->info.is_virtfn )
+pdev->info.is_extfn = pf_is_extfn;
 check_pdev(pdev);
 
 ret = 0;
diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..75c9c92 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -211,15 +211,15 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const 
struct pci_dev *pdev)
 if ( pdev == NULL )
 return NULL;
 
-if ( pdev->info.is_extfn )
+if ( pdev->info.is_virtfn )
 {
-bus = pdev->bus;
-devfn = 0;
+bus = pdev->info.physfn.bus;
+devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
 }
-else if ( pdev->info.is_virtfn )
+else if ( pdev->info.is_extfn )
 {
-bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : 
pdev->info.physfn.devfn;
+bus = pdev->bus;
+devfn = 0;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..4dd42ac 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -39,6 +39,7 @@
 #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
 
 struct pci_dev_info {
+/* VF's 'is_extfn' is used to show whether its PF is an extended function */
 bool_t is_extfn;
 bool_t is_virtfn;
 struct {
-- 
1.8.3.1




Re: [Xen-devel] [PATCH v8] VT-d: use correct BDF for VF to search VT-d unit

2017-08-25 Thread Chao Gao
I have sent out a new version, let's skip this one.

Thanks
Chao

On Fri, Aug 25, 2017 at 12:17:15PM +0800, Chao Gao wrote:
>When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are under
>the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
>Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
>And furthermore, 'Extended Functions' on an endpoint are under the scope of
>the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
>VT-d unit, the BDF of PF or the BDF of a traditional function may be used. And
>it depends on whether the PF is an extended function or not.
>
>Current code uses PCI_SLOT() to recognize an ARI 'Extended Funcion'. But it
>is conceptually wrong w/o checking whether PF is an extended function and
>would lead to match VFs of a RC endpoint to a wrong VT-d unit.
>
>This patch reuses 'is_extfn' field in VF's struct pci_dev_info to indicate
>whether the PF of this VF is an extended function. The field helps to use
>correct BDF to search VT-d unit.
>
>Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>Signed-off-by: Chao Gao <chao@intel.com>
>---
>v8:
> - use "conceptually wrong", instead of "a corner case" in commit message
> - check 'is_virtfn' first in acpi_find_matched_drhd_unit()
>
>v7:
> - Drop Eric's tested-by
> - Change commit message to be clearer
> - Re-use VF's is_extfn field
> - access PF's is_extfn field in locked area
>
>---
> xen/drivers/passthrough/pci.c  |  6 ++
> xen/drivers/passthrough/vtd/dmar.c | 12 ++--
> xen/include/xen/pci.h  |  4 
> 3 files changed, 16 insertions(+), 6 deletions(-)
>
>diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>index 27bdb71..2a91320 100644
>--- a/xen/drivers/passthrough/pci.c
>+++ b/xen/drivers/passthrough/pci.c
>@@ -599,6 +599,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
> unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
> const char *pdev_type;
> int ret;
>+bool pf_is_extfn = false;
> 
> if (!info)
> pdev_type = "device";
>@@ -608,6 +609,8 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
> {
> pcidevs_lock();
> pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
>+if ( pdev )
>+pf_is_extfn = pdev->info.is_extfn;
> pcidevs_unlock();
> if ( !pdev )
> pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
>@@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>seg, bus, slot, func, ctrl);
> }
> 
>+/* VF's 'is_extfn' is used to indicate whether PF is an extended function 
>*/
>+if ( pdev->info.is_virtfn )
>+pdev->info.is_extfn = pf_is_extfn;
> check_pdev(pdev);
> 
> ret = 0;
>diff --git a/xen/drivers/passthrough/vtd/dmar.c 
>b/xen/drivers/passthrough/vtd/dmar.c
>index 82040dd..75c9c92 100644
>--- a/xen/drivers/passthrough/vtd/dmar.c
>+++ b/xen/drivers/passthrough/vtd/dmar.c
>@@ -211,15 +211,15 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const 
>struct pci_dev *pdev)
> if ( pdev == NULL )
> return NULL;
> 
>-if ( pdev->info.is_extfn )
>+if ( pdev->info.is_virtfn )
> {
>-bus = pdev->bus;
>-devfn = 0;
>+bus = pdev->info.physfn.bus;
>+devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
> }
>-else if ( pdev->info.is_virtfn )
>+else if ( pdev->info.is_extfn )
> {
>-bus = pdev->info.physfn.bus;
>-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : 
>pdev->info.physfn.devfn;
>+bus = pdev->bus;
>+devfn = 0;
> }
> else
> {
>diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
>index 59b6e8a..ea86f9f 100644
>--- a/xen/include/xen/pci.h
>+++ b/xen/include/xen/pci.h
>@@ -39,6 +39,10 @@
> #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
> 
> struct pci_dev_info {
>+/*
>+ * Considering VF's 'is_extfn' field isn't used, we reuse VF's 'is_extfn'
>+ * field to show whether the PF of this VF is an extended function.
>+ */
> bool_t is_extfn;
> bool_t is_virtfn;
> struct {
>-- 
>1.8.3.1
>



[Xen-devel] (no subject)

2017-08-25 Thread Chao Gao
From 3aa2541108f28cfdf0f3bf47ddae9b762b73b532 Mon Sep 17 00:00:00 2001
From: Chao Gao <chao@intel.com>
Date: Mon, 7 Aug 2017 04:50:04 +0800
Subject: [PATCH v9] VT-d: use correct BDF for VF to search VT-d unit

When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are under
the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
And furthermore, 'Extended Functions' on an endpoint are under the scope of
the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
VT-d unit, the BDF of PF or the BDF of a traditional function may be used. And
it depends on whether the PF is an extended function or not.

Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'. But it
is conceptually wrong without checking whether the PF is an extended function
and would lead to matching VFs of an RC endpoint to a wrong VT-d unit.

This patch uses VF's 'is_extfn' field to indicate whether the PF of this VF is
an extended function. The field helps to use correct BDF to search VT-d unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
v9:
 - check 'is_virtfn' first in pci_add_device() to avoid potential error if
 linux side sets VF's 'is_extfn'
 - comments changes to make it clear that we use VF's 'is_extfn' intentionally
 otherwise the patch seems like a workaround.

v8:
 - use "conceptually wrong", instead of "a corner case" in commit message
 - check 'is_virtfn' first in acpi_find_matched_drhd_unit()

v7:
 - Drop Eric's tested-by
 - Change commit message to be clearer
 - Re-use VF's is_extfn field
 - access PF's is_extfn field in locked area

---
 xen/drivers/passthrough/pci.c  | 14 ++
 xen/drivers/passthrough/vtd/dmar.c | 12 ++--
 xen/include/xen/pci.h  |  1 +
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..0e27e29 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,21 +599,24 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
-if (!info)
+if ( !info )
 pdev_type = "device";
-else if (info->is_extfn)
-pdev_type = "extended function";
-else if (info->is_virtfn)
+else if ( info->is_virtfn )
 {
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
 pcidevs_unlock();
 if ( !pdev )
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
NULL, node);
 pdev_type = "virtual function";
 }
+else if ( info->is_extfn )
+pdev_type = "extended function";
 else
 {
 info = NULL;
@@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, ctrl);
 }
 
+/* VF's 'is_extfn' is used to indicate whether PF is an extended function 
*/
+if ( pdev->info.is_virtfn )
+pdev->info.is_extfn = pf_is_extfn;
 check_pdev(pdev);
 
 ret = 0;
diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..75c9c92 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -211,15 +211,15 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const 
struct pci_dev *pdev)
 if ( pdev == NULL )
 return NULL;
 
-if ( pdev->info.is_extfn )
+if ( pdev->info.is_virtfn )
 {
-bus = pdev->bus;
-devfn = 0;
+bus = pdev->info.physfn.bus;
+devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
 }
-else if ( pdev->info.is_virtfn )
+else if ( pdev->info.is_extfn )
 {
-bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : 
pdev->info.physfn.devfn;
+bus = pdev->bus;
+devfn = 0;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..4dd42ac 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -39,6 +39,7 @@
 #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
 
 struct pci_dev_info {
+/* VF's 'is_extfn' is used to show whether its PF is an extended function */
 bool_t is_extfn;
 bool_t is_virtfn;
 struct {
-- 
1.8.3.1




[Xen-devel] [PATCH v8] VT-d: use correct BDF for VF to search VT-d unit

2017-08-24 Thread Chao Gao
When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are under
the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
And furthermore, 'Extended Functions' on an endpoint are under the scope of
the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
VT-d unit, the BDF of PF or the BDF of a traditional function may be used. And
it depends on whether the PF is an extended function or not.

Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'. But it
is conceptually wrong without checking whether the PF is an extended function
and would lead to matching VFs of an RC endpoint to a wrong VT-d unit.

This patch reuses 'is_extfn' field in VF's struct pci_dev_info to indicate
whether the PF of this VF is an extended function. The field helps to use
correct BDF to search VT-d unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
v8:
 - use "conceptually wrong", instead of "a corner case" in commit message
 - check 'is_virtfn' first in acpi_find_matched_drhd_unit()

v7:
 - Drop Eric's tested-by
 - Change commit message to be clearer
 - Re-use VF's is_extfn field
 - access PF's is_extfn field in locked area

---
 xen/drivers/passthrough/pci.c  |  6 ++
 xen/drivers/passthrough/vtd/dmar.c | 12 ++--
 xen/include/xen/pci.h  |  4 
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..2a91320 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,6 +599,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
 if (!info)
 pdev_type = "device";
@@ -608,6 +609,8 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 {
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
 pcidevs_unlock();
 if ( !pdev )
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
@@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, ctrl);
 }
 
+/* VF's 'is_extfn' is used to indicate whether PF is an extended function 
*/
+if ( pdev->info.is_virtfn )
+pdev->info.is_extfn = pf_is_extfn;
 check_pdev(pdev);
 
 ret = 0;
diff --git a/xen/drivers/passthrough/vtd/dmar.c 
b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..75c9c92 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -211,15 +211,15 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const 
struct pci_dev *pdev)
 if ( pdev == NULL )
 return NULL;
 
-if ( pdev->info.is_extfn )
+if ( pdev->info.is_virtfn )
 {
-bus = pdev->bus;
-devfn = 0;
+bus = pdev->info.physfn.bus;
+devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
 }
-else if ( pdev->info.is_virtfn )
+else if ( pdev->info.is_extfn )
 {
-bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : 
pdev->info.physfn.devfn;
+bus = pdev->bus;
+devfn = 0;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..ea86f9f 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -39,6 +39,10 @@
 #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
 
 struct pci_dev_info {
+/*
+ * Considering VF's 'is_extfn' field isn't used, we reuse VF's 'is_extfn'
+ * field to show whether the PF of this VF is an extended function.
+ */
 bool_t is_extfn;
 bool_t is_virtfn;
 struct {
-- 
1.8.3.1




Re: [Xen-devel] [PATCH XEN] x86/pt: add a MSI unmask flag to XEN_DOMCTL_bind_pt_irq

2017-08-24 Thread Chao Gao
On Thu, Aug 24, 2017 at 04:17:32AM -0600, Jan Beulich wrote:
 On 24.08.17 at 12:12,  wrote:
>> On Thu, Aug 24, 2017 at 04:07:40AM -0600, Jan Beulich wrote:
>>> >>> On 24.08.17 at 11:47,  wrote:
>>> > @@ -438,6 +439,22 @@ int pt_irq_create_bind(
>>> >  pi_update_irte(vcpu ? &vcpu->arch.hvm_vmx.pi_desc : NULL,
>>> > info, pirq_dpci->gmsi.gvec);
>>> >  
>>> > +if ( pt_irq_bind->u.msi.gflags & VMSI_UNMASKED )
>>> > +{
>>> > +struct irq_desc *desc = irq_to_desc(info->arch.irq);
>>> > +unsigned long flags;
>>> > +
>>> > +if ( !desc )
>>> > +{
>>> > +pt_irq_destroy_bind(d, pt_irq_bind);
>>> > +return -EINVAL;
>>> > +}
>>> > +
>>> > +spin_lock_irqsave(&desc->lock, flags);
>>> > +guest_mask_msi_irq(desc, false);
>>> > +spin_unlock_irqrestore(&desc->lock, flags);
>>> > +}
>>> > +
>>> >  break;
>>> >  }
>>> 
>>> I think you would better use pirq_spin_lock_irq_desc() here. And
>>> wouldn't the addition better be moved up a little (perhaps right
>>> after the dropping of the domain's event lock)?
>> 
>> Shouldn't the unmask happen after the posted interrupt is setup? Or it
>> doesn't really matter?
>> 
>> I though it was safer to unmask once the bind process was finished.
>
>Yeah, I'm not entirely certain either, hence I've put it as a question.
>Kevin, Chao?
>

Hi, Jan and Roger.

pi_update_irte() right above that piece of code sets the IRTE properly
according to the request. Unmasking the MSI without updating the IRTE
first may, I think, lead to injecting an interrupt whose vector or
destination is out of date.
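
In other words, the ordering would be roughly (a sketch only, reusing the
names from the hunk quoted above; unmask_under_desc_lock() is a made-up
placeholder for the guest_mask_msi_irq(..., false) block):

    /* 1. Make the IRTE match the requested vector/destination first. */
    pi_update_irte(vcpu ? &vcpu->arch.hvm_vmx.pi_desc : NULL,
                   info, pirq_dpci->gmsi.gvec);

    /* 2. Only then unmask, so a fired MSI cannot use stale routing info. */
    if ( pt_irq_bind->u.msi.gflags & VMSI_UNMASKED )
        unmask_under_desc_lock(info);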

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-24 Thread Chao Gao
On Thu, Aug 24, 2017 at 02:22:47AM -0600, Jan Beulich wrote:
 On 24.08.17 at 10:01,  wrote:
>>>  From: Tian, Kevin
>>> Sent: Thursday, August 24, 2017 3:22 PM
>>> 
>>> > From: Gao, Chao
>>> > Sent: Tuesday, August 22, 2017 5:52 AM
>>> >
>>> > When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are
>>> > under
>>> > the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
>>> > Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
>>> > And furthermore, 'Extended Functions' on an endpoint are under the
>>> scope
>>> > of
>>> > the same VT-d unit as the 'Traditional Functions' on the endpoint. To
>>> > search
>>> > VT-d unit, the BDF of PF or the BDF of a traditional function may be used.
>>> > And
>>> > it depends on whether the PF is an extended function or not.
>>> >
>>> > Current code uses PCI_SLOT() to recognize an ARI 'Extended Funcion'. But
>>> it
>>> > is problematic for a corner case (a RC endpoint with SRIOV capability
>>> 
>>> it's not a corner case. It's "conceptually wrong" w/o checking is_extfn.
>>> 
>>> > and has its own VT-d unit), leading to matching to a wrong VT-d unit.
>>> >
>>> > This patch reuses 'is_extfn' field in VF's struct pci_dev_info to indicate
>>> > whether the PF of this VF is an extended function. The field helps to use
>>> > correct BDF to search VT-d unit.
>>> 
>>> We should directly call "whether this VF is an extended function".
>>> 
>>> SR-IOV spec clearly says:
>>> 
>>> --
>>> The ARI capability enables a Device to support up to 256 Functions -
>>> Functions, PFs, or VFs in any combination - associated with the
>>> captured Bus Number.
>>> --
>>> 
>>> So a VF with function number >7 is also an extended function.
>>> 
>> 
>> Had a discussion with Chao. My previous understanding looks
>> not accurate. From VT-d spec:
>> 
>> 1) VF is under the scope of the same VT-d as the PF
>> 
>> 2) if PF is extended function, it is under the scope of the same
>> VT-d as the traditional functions on the endpoint.
>> 
>> Above applies to any VF requestor ID (including <=7), so when setting
>> is_extfn for a VF, it really doesn't mean VF is an extended function.
>> Instead it always refers to the PF attribute. Then let's still add the
>> original comment to mark it out.
>> 
>> Based on that, possibly below logic can better match above policy:
>> 
>> if ( pdev->info.is_virtfn )
>> {
>>  bus = pdev->info.physfn.bus;
>>  devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
>
>But that's not in line with what you say above: You look at the
>VF's is_extfn here instead of at the PF's one. I.e. that would
>only be correct if the PF's flag got propagated to all its VFs,
>which I think earlier discussion had ruled out as an option (as
>that would depend on the current, assumed buggy, behavior
>of the corresponding Linux code to remain unchanged). Or the

I think Kevin did agree to this solution: propagating the PF's is_extfn to
all its VFs (namely, reusing a VF's is_extfn to show whether its PF is an
extended function or not). And the sample code may be more
straightforward than Roger's proposal, as it can be easily matched to the
rules mentioned above.

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-23 Thread Chao Gao
On Wed, Aug 23, 2017 at 09:01:07AM +0100, Roger Pau Monné wrote:
>On Wed, Aug 23, 2017 at 02:46:08PM +0800, Chao Gao wrote:
>> On Wed, Aug 23, 2017 at 08:31:51AM +0100, Roger Pau Monné wrote:
>> >On Wed, Aug 23, 2017 at 01:20:13AM -0600, Jan Beulich wrote:
>> >> >>> On 23.08.17 at 09:16, <roger@citrix.com> wrote:
>> >> > On Wed, Aug 23, 2017 at 09:05:14AM +0800, Chao Gao wrote:
>> >> >> On Tue, Aug 22, 2017 at 06:43:49AM -0600, Jan Beulich wrote:
>> >> >> >>>> On 21.08.17 at 23:52, <chao@intel.com> wrote:
>> >> >> >> --- a/xen/include/xen/pci.h
>> >> >> >> +++ b/xen/include/xen/pci.h
>> >> >> >> @@ -39,6 +39,10 @@
>> >> >> >>  #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
>> >> >> >>  
>> >> >> >>  struct pci_dev_info {
>> >> >> >> +/*
>> >> >> >> + * When 'is_virtfn' is set, 'is_extfn' is re-used to indicate 
>> >> >> >> whether
>> >> >> >> + * the PF of this VF is an extended function.
>> >> >> >> + */
>> >> >> >
>> >> >> >I'd be inclined to extend the comment by appending ", as a VF itself
>> >> >> >can never be an extended function." Is that correct? If so, would
>> >> >> 
>> >> >> Hi, Jan and Roger.
>> >> >> 
>> >> >> Strictly speaking, the VF can be an extended function. The definition 
>> >> >> is
>> >> >> within ARI device (in this kind of device, device field is treated as 
>> >> >> an
>> >> >> extension of function number) and function number is greater than 7. 
>> >> >> But
>> >> >> this field isn't used as we don't care about whether a VF is or not an
>> >> >> extended function (at least at present).
>> >> >> 
>> >> >> Eric reviewed this patch and told me we may match
>> >> >> 'if ( pdev->info.is_extfn )' in acpi_find_matched_drhd_unit.
>> >> >> So we may introduce a new field like what I do in v6 or check
>> >> >> 'pdev->info.is_virtfn' first in acpi_find_matched_drhd_unit (maybe 
>> >> >> other
>> >> >> places we check pdev->info.is_extfn).
>> >> >> 
>> >> >> Which one do you prefer?
>> >> > 
>> >> > Looking at this again I'm not sure why you need any modifications to
>> >> > acpi_find_matched_drhd_unit. If the virtual function is an extended
>> >> > function pdev->bus should be equal to pdev->info.physfn.bus, in which
>> >> > case the already existing is_extfn check will already DTRT?
>> >> > 
>> >> > Ie: an extended VF should always have the same bus as the PF it
>> >> > belongs to, unless I'm missing something.
>> >> 
>> >> Why would that be?
>> >
>> >It is my understanding (which might be wrong), that an extended
>> >function simply uses 8 bits for the function number, which on a
>> >traditional device would be used for both the slot and the function
>> >number.
>> >
>> >So extended functions have no slot, but the bus number is the same for
>> >all of them, or else they would belong to different devices due to the
>> >difference in the bus numbers.
>> >
>> >Maybe what I'm missing is whether it is possible to have a device with
>> >virtual functions that expand across several buses?
>> 
>> It is not true. Please refer to the 2.1.2 VF Discovery of SR-IOV spec.
>> The numbers of VF can be larger than 256 and so it is definite that
>> sometimes VF's bus number would be different from the PF's.
>
>So that's what I was missing, thanks.
>
>Then I would modify acpi_find_matched_drhd_unit so it's:
>
>if ( pdev->info.is_extfn )
>{
>bus = pdev->info.is_virtfn ? pdev->info.physfn.bus : pdev->bus;
>devfn = 0;
>}
>
>AFAICT that should work?

Fine to me.

Jan, what is your opinion on this piece of code?

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-23 Thread Chao Gao
On Wed, Aug 23, 2017 at 02:04:24AM -0600, Jan Beulich wrote:
 On 23.08.17 at 03:05,  wrote:
>> Strictly speaking, the VF can be an extended function. The definition is
>> within ARI device (in this kind of device, device field is treated as an
>> extension of function number) and function number is greater than 7. But
>> this field isn't used as we don't care about whether a VF is or not an
>> extended function (at least at present).
>
>Hmm, that's not in line with what Linux'es xen_add_device() does:
>
>#ifdef CONFIG_PCI_IOV
>   if (pci_dev->is_virtfn) {
>   add->flags = XEN_PCI_DEV_VIRTFN;
>   add->physfn.bus = physfn->bus->number;
>   add->physfn.devfn = physfn->devfn;
>   } else
>#endif
>   if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn))
>   add->flags = XEN_PCI_DEV_EXTFN;
>
>Note the "else" in there. Are you saying this is actually wrong? (I
>indeed do see ARI capability structures in the VFs of the one
>SR-IOV capable system I have direct access to.)

Yes. I think it is wrong. But considering that no one in Xen needs this
information, not setting XEN_PCI_DEV_EXTFN for a VF is acceptable.

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-23 Thread Chao Gao
On Wed, Aug 23, 2017 at 08:31:51AM +0100, Roger Pau Monné wrote:
>On Wed, Aug 23, 2017 at 01:20:13AM -0600, Jan Beulich wrote:
>> >>> On 23.08.17 at 09:16, <roger@citrix.com> wrote:
>> > On Wed, Aug 23, 2017 at 09:05:14AM +0800, Chao Gao wrote:
>> >> On Tue, Aug 22, 2017 at 06:43:49AM -0600, Jan Beulich wrote:
>> >> >>>> On 21.08.17 at 23:52, <chao@intel.com> wrote:
>> >> >> --- a/xen/include/xen/pci.h
>> >> >> +++ b/xen/include/xen/pci.h
>> >> >> @@ -39,6 +39,10 @@
>> >> >>  #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
>> >> >>  
>> >> >>  struct pci_dev_info {
>> >> >> +/*
>> >> >> + * When 'is_virtfn' is set, 'is_extfn' is re-used to indicate 
>> >> >> whether
>> >> >> + * the PF of this VF is an extended function.
>> >> >> + */
>> >> >
>> >> >I'd be inclined to extend the comment by appending ", as a VF itself
>> >> >can never be an extended function." Is that correct? If so, would
>> >> 
>> >> Hi, Jan and Roger.
>> >> 
>> >> Strictly speaking, the VF can be an extended function. The definition is
>> >> within ARI device (in this kind of device, device field is treated as an
>> >> extension of function number) and function number is greater than 7. But
>> >> this field isn't used as we don't care about whether a VF is or not an
>> >> extended function (at least at present).
>> >> 
>> >> Eric reviewed this patch and told me we may match
>> >> 'if ( pdev->info.is_extfn )' in acpi_find_matched_drhd_unit.
>> >> So we may introduce a new field like what I do in v6 or check
>> >> 'pdev->info.is_virtfn' first in acpi_find_matched_drhd_unit (maybe other
>> >> places we check pdev->info.is_extfn).
>> >> 
>> >> Which one do you prefer?
>> > 
>> > Looking at this again I'm not sure why you need any modifications to
>> > acpi_find_matched_drhd_unit. If the virtual function is an extended
>> > function pdev->bus should be equal to pdev->info.physfn.bus, in which
>> > case the already existing is_extfn check will already DTRT?
>> > 
>> > Ie: an extended VF should always have the same bus as the PF it
>> > belongs to, unless I'm missing something.
>> 
>> Why would that be?
>
>It is my understanding (which might be wrong), that an extended
>function simply uses 8 bits for the function number, which on a
>traditional device would be used for both the slot and the function
>number.
>
>So extended functions have no slot, but the bus number is the same for
>all of them, or else they would belong to different devices due to the
>difference in the bus numbers.
>
>Maybe what I'm missing is whether it is possible to have a device with
>virtual functions that expand across several buses?

That is not true. Please refer to section 2.1.2 "VF Discovery" of the
SR-IOV spec. The number of VFs can be larger than 256, so a VF's bus
number can definitely differ from the PF's at times.
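
Concretely, the SR-IOV spec derives VF Routing IDs arithmetically from the
PF's, so they can spill into higher bus numbers. A sketch of that
calculation (my reading of the spec; First VF Offset and VF Stride come
from the PF's SR-IOV capability):

    /* Routing ID (bus:devfn packed into 16 bits) of the n-th VF, n >= 1. */
    static inline uint16_t vf_routing_id(uint16_t pf_rid,
                                         uint16_t first_vf_offset,
                                         uint16_t vf_stride, unsigned int n)
    {
        /* 16-bit arithmetic: a carry out of the devfn byte bumps the bus. */
        return pf_rid + first_vf_offset + (n - 1) * vf_stride;
    }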

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-22 Thread Chao Gao
On Tue, Aug 22, 2017 at 06:43:49AM -0600, Jan Beulich wrote:
 On 21.08.17 at 23:52,  wrote:
>> --- a/xen/include/xen/pci.h
>> +++ b/xen/include/xen/pci.h
>> @@ -39,6 +39,10 @@
>>  #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
>>  
>>  struct pci_dev_info {
>> +/*
>> + * When 'is_virtfn' is set, 'is_extfn' is re-used to indicate whether
>> + * the PF of this VF is an extended function.
>> + */
>
>I'd be inclined to extend the comment by appending ", as a VF itself
>can never be an extended function." Is that correct? If so, would

Hi, Jan and Roger.

Strictly speaking, a VF can be an extended function. The definition
applies within an ARI device (in this kind of device, the device field is
treated as an extension of the function number) to functions whose
function number is greater than 7. But this field isn't used, as we don't
care whether a VF is an extended function or not (at least at present).
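
Roughly (a sketch, with an ari_enabled flag standing in for a
pci_ari_enabled()-style check like the Linux one quoted elsewhere in this
thread):

    /* PCIe: an Extended Function is a function of an ARI device whose
     * function number is greater than 7; with ARI the whole 8-bit devfn
     * is the function number, so this is equivalent to PCI_SLOT(devfn) != 0
     * on an ARI-enabled device. */
    static inline bool is_ari_extended_function(bool ari_enabled, uint8_t devfn)
    {
        return ari_enabled && devfn > 7;
    }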

Eric reviewed this patch and told me we may end up matching the
'if ( pdev->info.is_extfn )' branch in acpi_find_matched_drhd_unit().
So we may either introduce a new field, like what I did in v6, or check
'pdev->info.is_virtfn' first in acpi_find_matched_drhd_unit() (and maybe
in other places where we check pdev->info.is_extfn).

Which one do you prefer?

Thanks
Chao



Re: [Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-22 Thread Chao Gao
On Tue, Aug 22, 2017 at 08:29:58AM +0100, Roger Pau Monné wrote:
>On Tue, Aug 22, 2017 at 05:52:04AM +0800, Chao Gao wrote:
>> When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are 
>> under
>> the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
>> Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
>> And furthermore, 'Extended Functions' on an endpoint are under the scope of
>> the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
>> VT-d unit, the BDF of PF or the BDF of a traditional function may be used. 
>> And
>> it depends on whether the PF is an extended function or not.
>> 
>> Current code uses PCI_SLOT() to recognize an ARI 'Extended Funcion'. But it
>> is problematic for a corner case (a RC endpoint with SRIOV capability
>> and has its own VT-d unit), leading to matching to a wrong VT-d unit.
>> 
>> This patch reuses 'is_extfn' field in VF's struct pci_dev_info to indicate
>> whether the PF of this VF is an extended function. The field helps to use
>> correct BDF to search VT-d unit.
>> 
>> Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>> Signed-off-by: Chao Gao <chao@intel.com>
>
>This looks fine to me:
>
>Reviewed-by: Roger Pau Monné <roger@citrix.com>
>

Thank you, Roger.

>Given the issues we had before with this commit, could we please have
>a Tested-by by someone? I saw that you dropped Eric's, and I would
>like to have it again.

Hi, Eric.

Could you test this patch again and give it your Tested-by if it passes
your test?

Thanks
Chao




Re: [Xen-devel] [PATCH v6] VT-d: fix VF of RC integrated PF matched to wrong VT-d unit

2017-08-22 Thread Chao Gao
On Thu, Aug 17, 2017 at 03:43:07PM +0800, Tian, Kevin wrote:
>> From: Gao, Chao
>> Sent: Wednesday, August 16, 2017 1:12 PM
>> 
>> The problem is for a VF of RC integrated PF (e.g. PF's BDF is
>> 00:02.0), we would wrongly use 00:00.0 to search VT-d unit.
>> 
>> If a PF is an extended function, the BDF of a traditional function
>> within the same device should be used to search VT-d unit. Otherwise,
>> the real BDF of PF should be used. According PCI-e spec, an extended
>> function is a function within an ARI device and Function Number is
>> greater than 7. The original code tried to tell apart Extended
>> Function and non-Extended Function through checking PCI_SLOT(),
>> missing counterpart of pci_ari_enabled() (this function exists in
>> linux kernel) compared to linux kernel. Without checking whether ARI
>> is enabled, it incurs a RC integrated PF with PCI_SLOT() >0 is wrongly
>> classified to an extended function. Note that a RC integrated function
>> isn't within an ARI device and thus cannot be extended function and in
>> this case the real BDF should be used.
>> 
>> This patch introduces a new field, pf_is_extfn, in struct
>> pci_dev_info, to indicate whether the physical function is an extended
>> function. The new field helps to generate correct BDF to search VT-d
>> unit.
>> 
>> Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>> Tested-by: Crawford, Eric R <eric.r.crawf...@intel.com>
>> Signed-off-by: Chao Gao <chao@intel.com>
>> ---
>>  xen/drivers/passthrough/pci.c  | 6 +-
>>  xen/drivers/passthrough/vtd/dmar.c | 2 +-
>>  xen/include/xen/pci.h  | 1 +
>>  3 files changed, 7 insertions(+), 2 deletions(-)
>> 
>> diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
>> index 27bdb71..8c2ba33 100644
>> --- a/xen/drivers/passthrough/pci.c
>> +++ b/xen/drivers/passthrough/pci.c
>> @@ -599,6 +599,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>  unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
>>  const char *pdev_type;
>>  int ret;
>> +bool pf_is_extfn = false;
>> 
>>  if (!info)
>>  pdev_type = "device";
>> @@ -609,7 +610,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
>>  pcidevs_lock();
>>  pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
>>  pcidevs_unlock();
>> -if ( !pdev )
>> +if ( pdev )
>> +pf_is_extfn = pdev->info.is_extfn;
>
>besides Roger's comment, can you move above 2 lines inside lock
>protection?
>

Hi, Kevin and Roger.

I sent out a new version recently. The new version adopts all your
suggestions. Please review it.

Thanks
Chao



[Xen-devel] [PATCH v7] VT-d: use correct BDF for VF to search VT-d unit

2017-08-21 Thread Chao Gao
When SR-IOV is enabled, 'Virtual Functions' of a 'Physical Function' are under
the scope of the same VT-d unit as the 'Physical Function'. A 'Physical
Function' can be a 'Traditional Function' or an ARI 'Extended Function'.
And furthermore, 'Extended Functions' on an endpoint are under the scope of
the same VT-d unit as the 'Traditional Functions' on the endpoint. To search
VT-d unit, the BDF of PF or the BDF of a traditional function may be used. And
it depends on whether the PF is an extended function or not.

Current code uses PCI_SLOT() to recognize an ARI 'Extended Function'. But it
is problematic for a corner case (an RC endpoint with SR-IOV capability
that has its own VT-d unit), leading to matching to a wrong VT-d unit.

This patch reuses 'is_extfn' field in VF's struct pci_dev_info to indicate
whether the PF of this VF is an extended function. The field helps to use
correct BDF to search VT-d unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
v7:
 - Drop Eric's tested-by
 - Change commit message to be clearer
 - Re-use VF's is_extfn field
 - access PF's is_extfn field in locked area
---
 xen/drivers/passthrough/pci.c  | 6 ++
 xen/drivers/passthrough/vtd/dmar.c | 2 +-
 xen/include/xen/pci.h  | 4 
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..2a91320 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,6 +599,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
 if (!info)
 pdev_type = "device";
@@ -608,6 +609,8 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 {
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
 pcidevs_unlock();
 if ( !pdev )
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
@@ -707,6 +710,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, ctrl);
 }
 
+/* VF's 'is_extfn' is used to indicate whether PF is an extended function */
+if ( pdev->info.is_virtfn )
+pdev->info.is_extfn = pf_is_extfn;
 check_pdev(pdev);
 
 ret = 0;
diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..83ce5d4 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -219,7 +219,7 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
 else if ( pdev->info.is_virtfn )
 {
 bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
+devfn = pdev->info.is_extfn ? 0 : pdev->info.physfn.devfn;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..3b0da66 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -39,6 +39,10 @@
 #define PCI_SBDF3(s,b,df) ((((s) & 0xffff) << 16) | PCI_BDF2(b, df))
 
 struct pci_dev_info {
+/*
+ * When 'is_virtfn' is set, 'is_extfn' is re-used to indicate whether
+ * the PF of this VF is an extended function.
+ */
 bool_t is_extfn;
 bool_t is_virtfn;
 struct {
-- 
1.8.3.1




Re: [Xen-devel] [PATCH V2 9/25] tools/libxl: build DMAR table for a guest with one virtual VTD

2017-08-17 Thread Chao Gao
On Thu, Aug 17, 2017 at 01:28:21PM +0100, Wei Liu wrote:
>On Thu, Aug 17, 2017 at 12:32:17PM +0100, Wei Liu wrote:
>> On Wed, Aug 09, 2017 at 04:34:10PM -0400, Lan Tianyu wrote:
>> > diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
>> > index f54fd49..94c9196 100644
>> > --- a/tools/libxl/libxl_dom.c
>> > +++ b/tools/libxl/libxl_dom.c
>> > @@ -1060,6 +1060,42 @@ static int libxl__domain_firmware(libxl__gc *gc,
>> >  }
>> >  }
>> >  
>> > +/*
>> > + * If a guest has one virtual VTD, build DMAR table for it and joint 
>> > this
>> > + * table with existing content in acpi_modules in order to employ HVM
>> > + * firmware pass-through mechanism to pass-through DMAR table.
>> > + */
>> > +if (info->viommu.type == LIBXL_VIOMMU_TYPE_INTEL_VTD) {
>> > +datalen = 0;
>> > +e = libxl__dom_build_dmar(gc, info, dom, &data, &datalen);
>> > +if (e) {
>> > +LOGEV(ERROR, e, "failed to build DMAR table");
>> > +rc = ERROR_FAIL;
>> > +goto out;
>> > +}
>> > +if (datalen) {
>> > +libxl__ptr_add(gc, data);
>> > +if (!dom->acpi_modules[0].data) {
>> > +dom->acpi_modules[0].data = data;
>> > +dom->acpi_modules[0].length = (uint32_t)datalen;
>> > +} else {
>> > +/* joint tables */
>> > +void *newdata;
>> > +newdata = malloc(datalen + dom->acpi_modules[0].length);
>> 
>> All memory allocations in libxl should use libxl__*lloc wrappers.
>> 
>> > +if (!newdata) {
>> > +LOGE(ERROR, "failed to joint DMAR table to acpi 
>> > modules");
>> > +rc = ERROR_FAIL;
>> > +goto out;
>> > +}
>> > +memcpy(newdata, dom->acpi_modules[0].data,
>> > +   dom->acpi_modules[0].length);
>> > +memcpy(newdata + dom->acpi_modules[0].length, data, 
>> > datalen);
>> > +dom->acpi_modules[0].data = newdata;
>> > +dom->acpi_modules[0].length += (uint32_t)datalen;
>
>Also, this leaks the old pointer, right?

Yes. Will fix this.

>
>> > +}
>> > +}
>> > +}
>> 
>> This still looks wrong to me. How do you know acpi_modules[0] is DMAR
>> table?
>> 
>
>Oh, I sorta see why you do this, but I still think this is wrong. The
>DMAR should either be a new module or be joined to the existing one (and
>with all conflicts resolved).

Hi, Wei
Thanks for your comments.

IIRC, HVM only supports one ACPI module; the DMAR cannot be a new module.
Joining it to the existing one is the approach we are taking.

Which kind of conflicts do you think should be resolved? If you mean I
forgot to free the old buffer, I will fix this. If you mean the potential
overlap between the binary passed by the admin and the DMAR table built
here, I don't have much idea on this. Even without the DMAR table, the
binary may contain an MADT or other tables, and the toolstack doesn't
interpret the binary to check whether there are conflicts, right?
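
For reference, below is a minimal sketch (not the actual patch) of how the
joining could look once it is switched to the libxl allocation wrappers, so
the new buffer is gc-managed and nothing is leaked when acpi_modules[0] is
replaced. It assumes a helper sitting next to libxl__domain_firmware() in
libxl_dom.c with libxl_internal.h in scope, and that data/datalen hold the
freshly built DMAR blob as in the hunk above; the names are illustrative only:

/* Sketch: append a freshly built table to acpi_modules[0]. */
static void join_acpi_module0(libxl__gc *gc, struct xc_dom_image *dom,
                              const void *data, size_t datalen)
{
    uint32_t oldlen = dom->acpi_modules[0].length;
    /* gc-owned buffer: released with the gc, no explicit free needed */
    uint8_t *newdata = libxl__malloc(gc, oldlen + datalen);

    if (oldlen)
        memcpy(newdata, dom->acpi_modules[0].data, oldlen);
    memcpy(newdata + oldlen, data, datalen);

    /*
     * NB: if the old acpi_modules[0].data was heap-allocated, it still has
     * to be freed (or be gc-managed) by its owner; that is the leak you
     * pointed out.
     */
    dom->acpi_modules[0].data = newdata;
    dom->acpi_modules[0].length = oldlen + (uint32_t)datalen;
}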

Thanks
Chao



Re: [Xen-devel] [patch v6] vt-d: fix vf of rc integrated pf matched to wrong vt-d unit

2017-08-16 Thread Chao Gao
On Wed, Aug 16, 2017 at 09:17:46AM +0100, Roger Pau Monné wrote:
>On Wed, Aug 16, 2017 at 01:12:24PM +0800, Chao Gao wrote:
>> The problem is for a VF of RC integrated PF (e.g. PF's BDF is
>> 00:02.0), we would wrongly use 00:00.0 to search VT-d unit.
>>
>> If a PF is an extended function, the BDF of a traditional function
>> within the same device should be used to search VT-d unit. Otherwise,
>> the real BDF of PF should be used. According to the PCI-e spec, an
>> extended function is a function within an ARI device whose Function
>> Number is greater than 7.
>
>AFAIK, extended functions simply remove the slot and extend the
>function number to [0, 255], so it seems correct to expect that the
>VT-d unit search should be done using the bus and extended function
>parameters, and assume slot is 0. Is this some kind of limitation of
>VT-d?

The VT-d spec makes such a provision for VT-d unit search without any
explanation. But I don't think it is a VT-d limitation. Whether we can find
the right VT-d unit depends on the DMAR table. So I would rather regard it
as firmware not preparing entries for extended functions in the DMAR table.

>
>> The original code tried to tell apart Extended
>> Function and non-Extended Function through checking PCI_SLOT() alone,
>> missing a counterpart of pci_ari_enabled() (which exists in the Linux
>> kernel). Without checking whether ARI is enabled, an RC integrated PF
>> with PCI_SLOT() > 0 is wrongly classified as an extended function. Note
>> that an RC integrated function isn't within an ARI device and thus cannot
>> be an extended function; in this case the real BDF should be used.
>> 
>> This patch introduces a new field, pf_is_extfn, in struct
>> pci_dev_info, to indicate whether the physical function is an extended
>> function. The new field helps to generate correct BDF to search VT-d
>> unit.
>
>[...]
>> diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
>> index 59b6e8a..9e76aa0 100644
>> --- a/xen/include/xen/pci.h
>> +++ b/xen/include/xen/pci.h
>> @@ -40,6 +40,7 @@
>>  
>>  struct pci_dev_info {
>>  bool_t is_extfn;
>> +bool_t pf_is_extfn; /* Only valid for virtual function */
>
>Can't you just re-use is_virtfn and is_extfn, and when both are true
>it means the pf where this vf belongs is an extended function?

Yes. Reusing the VF's is_extfn field is possible.

Thanks
Chao



[Xen-devel] [PATCH v5 4/4] xentrace: add support for HVM's PI blocking list operation

2017-08-15 Thread Chao Gao
In order to analyze PI blocking list operation frequency and obtain
the list length, add some relevant events to xentrace and some
associated code in xenalyze.

Signed-off-by: Chao Gao <chao@intel.com>
---
v5:
 - Put pi list operation under HW events and get rid of ASYNC stuff
 - generate a scatterplot of the PI list length on each pCPU to give
 analysts a clearer picture.
v4:
 - trace part of Patch 1 in v3

---
 tools/xentrace/formats |   2 +
 tools/xentrace/xenalyze.c  | 116 +
 xen/arch/x86/hvm/vmx/vmx.c |  17 ++-
 xen/include/public/trace.h |   5 ++
 4 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/tools/xentrace/formats b/tools/xentrace/formats
index c1f584f..e926a18 100644
--- a/tools/xentrace/formats
+++ b/tools/xentrace/formats
@@ -205,6 +205,8 @@
0x00802006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  assign_vector [ irq = %(1)d = vector 0x%(2)x, CPU mask: 0x%(3)08x ]
0x00802007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  bogus_vector [ 0x%(1)x ]
0x00802008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  do_irq [ irq = %(1)d, began = %(2)dus, ended = %(3)dus ]
+0x00804001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  pi_list_add [ domid = 0x%(1)04x vcpu = 0x%(2)04x, pcpu = 0x%(3)04x, #entry = 0x%(4)04x ]
+0x00804002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  pi_list_del [ domid = 0x%(1)04x vcpu = 0x%(2)04x ]

0x00084001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  hpet create [ tn = %(1)d, irq = %(2)d, delta = 0x%(4)08x%(3)08x, period = 0x%(6)08x%(5)08x ]
0x00084002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  pit create [ delta = 0x%(1)016x, period = 0x%(2)016x ]
diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
index 24cce2a..2276a23 100644
--- a/tools/xentrace/xenalyze.c
+++ b/tools/xentrace/xenalyze.c
@@ -159,6 +159,7 @@ struct {
 scatterplot_extint_cycles:1,
 scatterplot_rdtsc:1,
 scatterplot_irq:1,
+scatterplot_pi_list:1,
 histogram_interrupt_eip:1,
 interval_mode:1,
 dump_all:1,
@@ -233,6 +234,7 @@ struct {
 .scatterplot_extint_cycles=0,
 .scatterplot_rdtsc=0,
 .scatterplot_irq=0,
+.scatterplot_pi_list=0,
 .histogram_interrupt_eip=0,
 .dump_all = 0,
 .dump_raw_process = 0,
@@ -1391,6 +1393,9 @@ struct hvm_data {
 
 /* Historical info */
 tsc_t last_rdtsc;
+
+/* Destination pcpu of posted interrupt's wakeup interrupt */
+int pi_cpu;
 };
 
 enum {
@@ -1457,6 +1462,8 @@ void init_hvm_data(struct hvm_data *h, struct vcpu_data *v) {
 }
 for(i=0; i<GUEST_INTERRUPT_MAX+1; i++)
 h->summary.guest_interrupt[i].count=0;
+
+h->pi_cpu = -1;
 }
 
 /* PV data */
@@ -1852,6 +1859,9 @@ struct pcpu_info {
 tsc_t tsc;
 struct cycle_summary idle, running, lost;
 } time;
+
+/* Posted Interrupt List Length */
+int pi_list_length;
 };
 
 void __fill_in_record_info(struct pcpu_info *p);
@@ -8581,8 +8591,97 @@ void irq_process(struct pcpu_info *p) {
 }
 }
 
+static void process_pi_list_add(struct record_info *ri)
+{
+struct {
+int did;
+int vid;
+int pcpu;
+int len;
+} *data = (typeof(data))ri->d;
+struct vcpu_data *v;
+
+v = vcpu_find(data->did, data->vid);
+if ( !v->hvm.init )
+init_hvm_data(&v->hvm, v);
+
+if ( opt.dump_all )
+printf("d%uv%u is added to pi blocking list of pcpu%u. "
+   "The list length is now %d\n",
+   data->did, data->vid, data->pcpu, data->len);
+
+v->hvm.pi_cpu = data->pcpu;
+/* Calibrate pi list length */
+P.pcpu[data->pcpu].pi_list_length = data->len;
+
+if ( opt.scatterplot_pi_list )
+{
+struct time_struct t;
+
+abs_cycles_to_time(ri->tsc, &t);
+printf("%d %u.%09u %d\n", data->pcpu, t.s, t.ns,
+   P.pcpu[data->pcpu].pi_list_length);
+}
+}
+
+static void process_pi_list_del(struct record_info *ri)
+{
+struct {
+int did;
+int vid;
+} *data = (typeof(data))ri->d;
+struct vcpu_data *v;
+
+v = vcpu_find(data->did, data->vid);
+if ( !v->hvm.init )
+init_hvm_data(&v->hvm, v);
+
+if ( opt.dump_all )
+{
+if ( v->hvm.pi_cpu != -1 )
+printf("d%uv%u is removed from pi blocking list of pcpu%u\n",
+   data->did, data->vid, v->hvm.pi_cpu);
+else
+printf("d%uv%u is removed from pi blocking list\n",
+   data->did, data->vid);
+}
+
+if ( (v->hvm.pi_cpu != -1) && (P.pcpu[v->hvm.pi_cpu].pi_list_length != -1) )
+{
+P.pcpu[v->hvm.pi_cpu].pi_list_length--;
+
+if ( opt.scatterplot_pi_list )
+{
+struct time_struct t;
+
+abs_cycles_to_time(ri->tsc, &t);
+printf("%d %u.%09u %d\n", v->hvm.pi_cpu, t.s, t.ns,
+  

[Xen-devel] [PATCH v5 1/4] VT-d PI: track the number of vcpus on pi blocking list

2017-08-15 Thread Chao Gao
This patch adds a field, counter, in struct vmx_pi_blocking_vcpu to track
how many entries are on the pi blocking list.

Signed-off-by: Chao Gao <chao@intel.com>
---
v5:
 - introduce two functions for adding or removing vcpus from pi blocking list.
 - check the sanity of vcpu count on pi blocking list
v4:
 - non-trace part of Patch 1 in v3

---
 xen/arch/x86/hvm/vmx/vmx.c | 42 --
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 67fc85b..bf17988 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -83,6 +83,7 @@ static int vmx_vmfunc_intercept(struct cpu_user_regs *regs);
 struct vmx_pi_blocking_vcpu {
 struct list_head list;
 spinlock_t   lock;
+unsigned int counter;
 };
 
 /*
@@ -100,6 +101,24 @@ void vmx_pi_per_cpu_init(unsigned int cpu)
spin_lock_init(&per_cpu(vmx_pi_blocking, cpu).lock);
 }
 
+static void vmx_pi_add_vcpu(struct pi_blocking_vcpu *pbv,
+struct vmx_pi_blocking_vcpu *vpbv)
+{
+ASSERT(spin_is_locked(&vpbv->lock));
+add_sized(&vpbv->counter, 1);
+ASSERT(read_atomic(&vpbv->counter));
+list_add_tail(&pbv->list, &vpbv->list);
+}
+
+static void vmx_pi_del_vcpu(struct pi_blocking_vcpu *pbv,
+struct vmx_pi_blocking_vcpu *vpbv)
+{
+ASSERT(spin_is_locked(&vpbv->lock));
+ASSERT(read_atomic(&vpbv->counter));
+list_del(&pbv->list);
+add_sized(&vpbv->counter, -1);
+}
+
 static void vmx_vcpu_block(struct vcpu *v)
 {
 unsigned long flags;
@@ -120,8 +139,8 @@ static void vmx_vcpu_block(struct vcpu *v)
  */
 ASSERT(old_lock == NULL);
 
-list_add_tail(&v->arch.hvm_vmx.pi_blocking.list,
-  &per_cpu(vmx_pi_blocking, v->processor).list);
+vmx_pi_add_vcpu(&v->arch.hvm_vmx.pi_blocking,
+&per_cpu(vmx_pi_blocking, v->processor));
 spin_unlock_irqrestore(pi_blocking_list_lock, flags);
 
 ASSERT(!pi_test_sn(pi_desc));
@@ -186,7 +205,9 @@ static void vmx_pi_unblock_vcpu(struct vcpu *v)
 if ( v->arch.hvm_vmx.pi_blocking.lock != NULL )
 {
 ASSERT(v->arch.hvm_vmx.pi_blocking.lock == pi_blocking_list_lock);
-list_del(&v->arch.hvm_vmx.pi_blocking.list);
+vmx_pi_del_vcpu(&v->arch.hvm_vmx.pi_blocking,
+container_of(pi_blocking_list_lock,
+ struct vmx_pi_blocking_vcpu, lock));
 v->arch.hvm_vmx.pi_blocking.lock = NULL;
 }
 
@@ -234,7 +255,7 @@ void vmx_pi_desc_fixup(unsigned int cpu)
  */
if ( pi_test_on(&vmx->pi_desc) )
{
-list_del(&vmx->pi_blocking.list);
+vmx_pi_del_vcpu(&vmx->pi_blocking, &per_cpu(vmx_pi_blocking, cpu));
 vmx->pi_blocking.lock = NULL;
 vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
 }
@@ -257,8 +278,9 @@ void vmx_pi_desc_fixup(unsigned int cpu)
write_atomic(&vmx->pi_desc.ndst,
 x2apic_enabled ? dest : MASK_INSR(dest, PI_xAPIC_NDST_MASK));
 
-list_move(&vmx->pi_blocking.list,
-  &per_cpu(vmx_pi_blocking, new_cpu).list);
+vmx_pi_del_vcpu(&vmx->pi_blocking, &per_cpu(vmx_pi_blocking, cpu));
+vmx_pi_add_vcpu(&vmx->pi_blocking, &per_cpu(vmx_pi_blocking,
+new_cpu));
 vmx->pi_blocking.lock = new_lock;
 
 spin_unlock(new_lock);
@@ -2351,9 +2373,9 @@ static struct hvm_function_table __initdata vmx_function_table = {
 static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
 {
 struct arch_vmx_struct *vmx, *tmp;
-spinlock_t *lock = &per_cpu(vmx_pi_blocking, smp_processor_id()).lock;
-struct list_head *blocked_vcpus =
-   &per_cpu(vmx_pi_blocking, smp_processor_id()).list;
+unsigned int cpu = smp_processor_id();
+spinlock_t *lock = &per_cpu(vmx_pi_blocking, cpu).lock;
+struct list_head *blocked_vcpus = &per_cpu(vmx_pi_blocking, cpu).list;
 
 ack_APIC_irq();
 this_cpu(irq_count)++;
@@ -2369,7 +2391,7 @@ static void pi_wakeup_interrupt(struct cpu_user_regs *regs)
 {
if ( pi_test_on(&vmx->pi_desc) )
{
-list_del(&vmx->pi_blocking.list);
+vmx_pi_del_vcpu(&vmx->pi_blocking, &per_cpu(vmx_pi_blocking, cpu));
 ASSERT(vmx->pi_blocking.lock == lock);
 vmx->pi_blocking.lock = NULL;
 vcpu_unblock(container_of(vmx, struct vcpu, arch.hvm_vmx));
-- 
1.8.3.1




[Xen-devel] [PATCH v5 0/4] mitigate the per-pCPU blocking list may be too long

2017-08-15 Thread Chao Gao
Changes in v5:
 - In patch 1, add a sanity check of the vcpu count on the pi blocking list
   and also drop George's Reviewed-by.
 - In patch 3, introduce a new function to find proper pCPU to accept
 the blocked vcpu.
 - In patch 4, add support of tracking the operations on pi blocking list
 and generating scatterplot of pi list length

VT-d PI introduces a per-pCPU blocking list to track the blocked vCPUs
on a given pCPU. Theoretically, there can be 32K domains on a single host,
with 128 vCPUs per domain. If all vCPUs are blocked on the same pCPU,
4M vCPUs are in the same list. Traversing this list consumes too
much time. More discussion can be found in [1,2,3].

To mitigate this issue, this series puts vcpus on another pcpu's list
when the local pcpu's list length reaches an upper bound, which is the
average vcpus-per-pcpu ratio plus a constant.

PATCH 1/4 adds a counter to track the per-pCPU blocking list's length.

PATCH 2/4 uses a global variable to track how many hvm vcpus are on this
system. It is used to calculate the average vcpus-per-pcpu ratio.

patch 3/4 employs a policy to restrict the vcpu count on a given
pcpu's pi blocking list in case the list grows too long. In short,
if the list length is smaller than the upper bound, the vcpu is added to
the pi blocking list of the pcpu which it is running on. Otherwise,
another online pcpu is chosen to accept the vcpu.

patch 4/4 adds some relevant events to xentrace to aid analysis of
the list length. With this patch, some data can be acquired to
validate patch 3/4. 
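
To put rough numbers on the worst case above: 32K domains x 128 vCPUs is
about 4M blocked vCPUs. On, say, the 112-pCPU Skylake box used for the
measurement in patch 3/4, the per-list limit works out to roughly
4M / 112 + 128, i.e. about 37K entries, instead of one 4M-entry list in the
unmitigated worst case. For ordinary loads the average term is tiny, so the
limit is effectively the 128-entry constant and the local pCPU keeps being
used.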

[1] 
https://lists.gt.net/xen/devel/422661?search_string=VT-d%20posted-interrupt%20core%20logic%20handling;#422661
[2] 
https://lists.gt.net/xen/devel/422567?search_string=%20The%20length%20of%20the%20list%20depends;#422567
[3] 
https://lists.gt.net/xen/devel/472749?search_string=enable%20vt-d%20pi%20by%20default;#472749

Chao Gao (4):
  VT-d PI: track the number of vcpus on pi blocking list
  x86/vcpu: track hvm vcpu number on the system
  VT-d PI: restrict the number of vcpus in a given pcpu's PI blocking
list
  xentrace: add support for HVM's PI blocking list operation

 tools/xentrace/formats|   2 +
 tools/xentrace/xenalyze.c | 116 +
 xen/arch/x86/hvm/hvm.c|   6 ++
 xen/arch/x86/hvm/vmx/vmx.c| 166 --
 xen/include/asm-x86/hvm/hvm.h |   3 +
 xen/include/public/trace.h|   5 ++
 6 files changed, 275 insertions(+), 23 deletions(-)

-- 
1.8.3.1




[Xen-devel] [PATCH v5 2/4] x86/vcpu: track hvm vcpu number on the system

2017-08-15 Thread Chao Gao
This number is used to calculate the average vcpus per pcpu ratio.

Signed-off-by: Chao Gao <chao@intel.com>
Acked-by: Jan Beulich <jbeul...@suse.com>
---
v4:
 - move the place we increase/decrease the hvm vcpu number to
 hvm_vcpu_{initialise, destroy}

---
 xen/arch/x86/hvm/hvm.c| 6 ++
 xen/include/asm-x86/hvm/hvm.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 555133f..37afdb4 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -109,6 +109,9 @@ static const char __initconst warning_hvm_fep[] =
 static bool_t __initdata opt_altp2m_enabled = 0;
 boolean_param("altp2m", opt_altp2m_enabled);
 
+/* Total number of HVM vCPUs on this system */
+atomic_t num_hvm_vcpus;
+
 static int cpu_callback(
 struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
@@ -1511,6 +1514,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
 hvm_update_guest_vendor(v);
 
+atomic_inc(&num_hvm_vcpus);
 return 0;
 
  fail6:
@@ -1529,6 +1533,8 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
 void hvm_vcpu_destroy(struct vcpu *v)
 {
+atomic_dec(&num_hvm_vcpus);
+
 viridian_vcpu_deinit(v);
 
 hvm_all_ioreq_servers_remove_vcpu(v->domain, v);
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index b687e03..c51bd9f 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #ifdef CONFIG_HVM_FEP
@@ -233,6 +234,8 @@ extern bool_t hvm_enabled;
 extern bool_t cpu_has_lmsl;
 extern s8 hvm_port80_allowed;
 
+extern atomic_t num_hvm_vcpus;
+
 extern const struct hvm_function_table *start_svm(void);
 extern const struct hvm_function_table *start_vmx(void);
 
-- 
1.8.3.1




[Xen-devel] [PATCH v5 3/4] VT-d PI: restrict the number of vcpus in a given pcpu's PI blocking list

2017-08-15 Thread Chao Gao
Currently, a blocked vCPU is put in its pCPU's PI blocking list. If
too many vCPUs are blocked on a given pCPU, the list grows too long.
In the theoretical worst case of 32K domains with 128 vCPUs per domain,
about 4M vCPUs may be blocked in one pCPU's PI blocking list. When a
wakeup interrupt arrives, the list is traversed to wake up vCPUs which
have events pending. In that case the traversal would consume much time.

To mitigate this issue, this patch limits the number of vCPUs tracked by a
given pCPU's blocking list, taking factors such as the performance of the
common case, the current HVM vCPU count and the current pCPU count into
consideration. With this method, the common case stays fast and, for the
extreme cases, the list length is kept under control.

With this patch, when a vcpu is to be blocked, we check whether the pi
blocking list's length of the pcpu where the vcpu is running exceeds
the limit which is the average vcpus per pcpu ratio plus a constant.
If no, the vcpu is added to this pcpu's pi blocking list. Otherwise,
another online pcpu is chosen to accept the vcpu.

Signed-off-by: Chao Gao <chao@intel.com>
---
v5:
 - Introduce a function to choose the suitable pcpu to accept the blocked
 vcpu.
v4:
 - use a new lock to avoid adding a blocked vcpu to an offline pcpu's
 blocking list.

---
 xen/arch/x86/hvm/vmx/vmx.c | 109 -
 1 file changed, 97 insertions(+), 12 deletions(-)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index bf17988..646f409 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -119,16 +119,85 @@ static void vmx_pi_del_vcpu(struct pi_blocking_vcpu *pbv,
add_sized(&vpbv->counter, -1);
 }
 
+/*
+ * By default, the local pcpu (namely, the one the vcpu is currently running on)
+ * is chosen as the destination of wakeup interrupt. But if the number of vcpus
+ * in the default pcpu's PI blocking list exceeds a limit, another suitable
+ * pcpu is chosen as the destination by iterating through all online pcpus.
+ *
+ * Currently, choose (v_tot/p_tot) + K as the limit of vcpus, where
+ * v_tot is the total number of hvm vcpus on the system, p_tot is the total
+ * number of pcpus in the system, and K is a fixed number. An experiment on a
+ * skylake server which has 112 cpus and 64G memory shows the maximum time of
+ * waking up a vcpu from a 128-entry blocking list is about 22us, which is
+ * tolerable. So choose 128 as the fixed number K.
+ *
+ * This policy makes sure:
+ * 1) for common cases, the limit won't be reached and the local pcpu is used
+ * which is beneficial to performance (at least, avoid an IPI when unblocking
+ * vcpu).
+ * 2) for the worst case, the blocking list length scales with the vcpu count
+ * divided by the pcpu count.
+ */
+#define PI_LIST_FIXED_LIMIT 128
+
+static inline bool pi_over_limit(unsigned int cpu)
+{
+/* Compare w/ constant first to save a division and an add */
+if ( likely(read_atomic(&per_cpu(vmx_pi_blocking, cpu).counter) <=
+PI_LIST_FIXED_LIMIT) )
+return 0;
+else
+return read_atomic(&per_cpu(vmx_pi_blocking, cpu).counter) >=
+   (atomic_read(&num_hvm_vcpus) / num_online_cpus()) +
+   PI_LIST_FIXED_LIMIT;
+}
+
+/*
+ * Start from @cpu and iterate cpu_online_map to look for one cpu whose
+ * blocking list length is under limit. Return with holding a lock to avoid
+ * others adding entries to the chosen cpu.
+ * There must be at least one suitable cpu for the limit is greater than the
+ * average number of all cpus' blocking list length.
+ */
+static unsigned int pi_get_blocking_cpu(unsigned int cpu, unsigned long *flags)
+{
+spinlock_t *pi_blocking_list_lock;
+
+for ( ; ; )
+{
+while ( unlikely(pi_over_limit(cpu)) )
+cpu = cpumask_cycle(cpu, &cpu_online_map);
+
+pi_blocking_list_lock = &per_cpu(vmx_pi_blocking, cpu).lock;
+if ( flags )
+spin_lock_irqsave(pi_blocking_list_lock, *flags);
+else
+spin_lock(pi_blocking_list_lock);
+/*
+ * check again in case the list length exceeds the limit during taking
+ * the lock
+ */
+if ( !pi_over_limit(cpu) )
+break;
+else if ( flags )
+spin_unlock_irqrestore(pi_blocking_list_lock, *flags);
+else
+spin_unlock(pi_blocking_list_lock);
+}
+
+return cpu;
+}
+
 static void vmx_vcpu_block(struct vcpu *v)
 {
 unsigned long flags;
-unsigned int dest;
-spinlock_t *old_lock;
-spinlock_t *pi_blocking_list_lock =
-   &per_cpu(vmx_pi_blocking, v->processor).lock;
+unsigned int dest, pi_cpu;
+spinlock_t *old_lock, *pi_blocking_list_lock;
struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
 
-spin_lock_irqsave(pi_blocking_list_lock, flags);
+pi_cpu = pi_get_blocking_cpu(v->processor, &flags);
+pi_b

[Xen-devel] [PATCH v6] VT-d: fix VF of RC integrated PF matched to wrong VT-d unit

2017-08-15 Thread Chao Gao
The problem is for a VF of RC integrated PF (e.g. PF's BDF is
00:02.0), we would wrongly use 00:00.0 to search VT-d unit.

If a PF is an extended function, the BDF of a traditional function
within the same device should be used to search VT-d unit. Otherwise,
the real BDF of PF should be used. According to the PCI-e spec, an extended
function is a function within an ARI device whose Function Number is
greater than 7. The original code tried to tell apart Extended
Function and non-Extended Function through checking PCI_SLOT() alone,
missing a counterpart of pci_ari_enabled() (which exists in the Linux
kernel). Without checking whether ARI is enabled, an RC integrated PF
with PCI_SLOT() > 0 is wrongly classified as an extended function. Note
that an RC integrated function isn't within an ARI device and thus cannot
be an extended function; in this case the real BDF should be used.

This patch introduces a new field, pf_is_extfn, in struct
pci_dev_info, to indicate whether the physical function is an extended
function. The new field helps to generate correct BDF to search VT-d
unit.

Reported-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Tested-by: Crawford, Eric R <eric.r.crawf...@intel.com>
Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/drivers/passthrough/pci.c  | 6 +-
 xen/drivers/passthrough/vtd/dmar.c | 2 +-
 xen/include/xen/pci.h  | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27bdb71..8c2ba33 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -599,6 +599,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
 const char *pdev_type;
 int ret;
+bool pf_is_extfn = false;
 
 if (!info)
 pdev_type = "device";
@@ -609,7 +610,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
 pcidevs_lock();
 pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn);
 pcidevs_unlock();
-if ( !pdev )
+if ( pdev )
+pf_is_extfn = pdev->info.is_extfn;
+else
 pci_add_device(seg, info->physfn.bus, info->physfn.devfn,
NULL, node);
 pdev_type = "virtual function";
@@ -707,6 +710,7 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
seg, bus, slot, func, ctrl);
 }
 
+pdev->info.pf_is_extfn = pf_is_extfn;
 check_pdev(pdev);
 
 ret = 0;
diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 82040dd..a96558f 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -219,7 +219,7 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
 else if ( pdev->info.is_virtfn )
 {
 bus = pdev->info.physfn.bus;
-devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
+devfn = pdev->info.pf_is_extfn ? 0 : pdev->info.physfn.devfn;
 }
 else
 {
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index 59b6e8a..9e76aa0 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -40,6 +40,7 @@
 
 struct pci_dev_info {
 bool_t is_extfn;
+bool_t pf_is_extfn; /* Only valid for virtual function */
 bool_t is_virtfn;
 struct {
 u8 bus;
-- 
1.8.3.1




[Xen-devel] [Question] how to avoid Xen using a memory range?

2017-08-04 Thread Chao Gao
Hi, everyone.

I have a machine which has two NUMA nodes. NODE0 contains the
memory range from 0 to 0x18400MB and NODE1 contains the memory range
from 0x18400MB to 0x1c400MB. The resources available to dom0 are
restricted by adding "dom0_mem=10G dom0_nodes=0 dom0_max_vcpus=48"
to the Xen command line. Even though no guest is created, over 1GB of
NODE1's memory is consumed according to the output of 'xl info -n'. From 'xl
debug-keys u', I found dom0 used a lot of pages of NODE1. And after
adding "highmem-start=0x18400MB" to the Xen command line to avoid relocating
dom0's image to NODE1, nearly 1GB of NODE1's memory is still in
use. Considering there is no CPU on NODE1, I think Xen shouldn't allocate
memory from NODE1 if no NUMA node is specified. Do you have any ideas about
what is using the memory of NODE1, and how to avoid this?

Thanks
Chao



Re: [Xen-devel] [PATCH v4 4/4] Xentrace: add support for HVM's PI blocking list operation

2017-07-28 Thread Chao Gao
On Fri, Jul 21, 2017 at 05:26:47PM +0100, George Dunlap wrote:
>On Fri, Jul 7, 2017 at 7:49 AM, Chao Gao <chao@intel.com> wrote:
>> In order to analyze PI blocking list operation frequency and obtain
>> the list length, add some relevant events to xentrace and some
>> associated code in xenalyze. Event ASYNC_PI_LIST_DEL may happen in interrupt
>> context, which incurs current assumptions checked in toplevel_assert_check()
>> are not suitable any more. Thus, this patch extends the 
>> toplevel_assert_check()
>> to remove such assumptions for events of type ASYNC_PI_LIST_DEL.
>>
>> Signed-off-by: Chao Gao <chao@intel.com>
>
>Hey Chao Gao,
>
>Thanks for doing the work to add this tracing support to xentrace --
>and in particular taking the effort to adapt the assert mechanism to
>be able to handle asynchronous events.
>
>I think in this case though, having a separate HVM sub-class for
>asynchronous events isn't really the right approach.  The main purpose
>of sub-classes is to help filter the events you want; and I can't
>think of any time you'd want to trace PI_LIST_DEL and not PI_LIST_ADD
>(or vice versa).  Secondly, the "asynchronous event" problem will be
>an issue for other contexts as well, and the solution will be the
>same.
>
>I think a better solution would be to do something similar to
>TRC_64_FLAG and TRC_HVM_IOMEM_[read,write], and claim another bit to
>create a TRC_ASYNC_FLAG (0x400 probably).  Then we can filter the
>"not_idle_domain" and "vcpu_data_mode" asserts on that.
>
>What do you think?

It makes sense to me. Your other comments on this series are also fine
to me. I will cook another version based on your suggestions.
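
Just to confirm my understanding, the change might look roughly like this
(illustrative only, not a patch; TRC_ASYNC_FLAG and the 0x400 value come from
your mail, the rest is assumption):

/* xen/include/public/trace.h: mark records that may be emitted in
 * interrupt context, similar in spirit to TRC_64_FLAG. */
#define TRC_ASYNC_FLAG 0x400

/* xenalyze, toplevel_assert_check(): skip the assumptions that only hold
 * for synchronous events (e.g. not_idle_domain, vcpu_data_mode) whenever
 * the record carries TRC_ASYNC_FLAG. */
if ( !(ri->event & TRC_ASYNC_FLAG) )
{
    /* ... existing not_idle_domain / vcpu_data_mode checks ... */
}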

Thanks
Chao



[Xen-devel] [PATCH] Revert "VT-d: fix VF of RC integrated PF matched to wrong VT-d unit"

2017-07-25 Thread Chao Gao
This reverts commit 89df98b77d28136c4d7aade13a1c8bc154d2919f, which
incurs a Xen crash when loading a VF driver. The reason seems to be that
pci_get_pdev() can't be called with interrupts disabled: it uses the
pcidevs lock, which is not an IRQ-safe lock, so the consistency check in
spinlock.c's check_lock() hits the BUG below when the lock is used in a
context with interrupts disabled. I don't have a quick solution to fix
this; therefore revert this patch to let common cases work well. As to
the corner case I intended to fix, I will propose another solution later.

Below is the call trace of Xen crash:
(XEN) Xen BUG at spinlock.c:47
(XEN) [ Xen-4.10-unstable  x86_64  debug=y   Tainted:  C   ]
(XEN) CPU:2
(XEN) RIP:e008:[] spinlock.c#check_lock+0x3c/0x40
(XEN) RFLAGS: 00010046   CONTEXT: hypervisor (d0v2)
(XEN) rax:    rbx: 82d08043b9c8   rcx: 0001
(XEN) rdx:    rsi:    rdi: 82d08043b9ce
(XEN) rbp: 83043c47fa50   rsp: 83043c47fa50   r8:  
(XEN) r9:     r10:    r11: 
(XEN) r12: 0001   r13:    r14: 0072
(XEN) r15: 83043c006c00   cr0: 80050033   cr4: 003526e0
(XEN) cr3: 00081b39a000   cr2: 88016c058548
(XEN) ds:    es:    fs:    gs:    ss: e010   cs: e008
(XEN) Xen code around  (spinlock.c#check_lock+0x3c/0x40):
(XEN)  98 83 f2 01 39 d0 75 02 <0f> 0b 5d c3 55 48 89 e5 f0 ff 05 a1 f6 1e 00 5d
(XEN) Xen stack trace from rsp=83043c47fa50:
(XEN)83043c47fa68 82d080235234 0005 83043c47fa78
(XEN)82d080251df3 83043c47fab8 82d080251e80 83043c47fac8
(XEN)83043c422580 83042e973cd0 0005 83042e9609e0
(XEN)0072 83043c47fae8 82d08025795a 83043c47fb18
(XEN)83043c47fc18 83043c47fc18 83042e9609e0 83043c47fba8
(XEN)82d080259be1 83043c47fb10 82d08023516b 0246
(XEN)83043c47fb28 0206 0002 83043c47fb58
(XEN)82d080290e38 83042e973cd0 83043c532000 83043c532000
(XEN)83042e973db0 83043c47fb68 82d080354dd0 83043c47fc18
(XEN)82d080274e07 0040 83042e9609e0 83043c47fc18
(XEN)83043c47fc18 0072 83043c006c00 83043c47fbb8
(XEN)82d0802526f7 83043c47fc08 82d080273c17 83043ff99d90
(XEN)83043c006c00 83043c47fc08 83043c006c00 83042e9609e0
(XEN)83043c47fc18 0072 83043c006c00 83043c47fc48
(XEN)82d0802754d1 feeff00c 0fff41ca 0002
(XEN)83042e9609e0 83042e973cd0 0002 83043c47fc88
(XEN)82d0802755a8 83043c47fc70 0246 83043c532000
(XEN)006c 83043c006c00  83043c47fd28
(XEN)82d080279b4f 83043c532000 83043c47fe00 83043c47fcd8
(XEN)83042e973d20 83043c47fcf0 83040325 0246
(XEN) Xen call trace:
(XEN)[] spinlock.c#check_lock+0x3c/0x40
(XEN)[] _spin_is_locked+0x11/0x4d
(XEN)[] pcidevs_locked+0x10/0x17
(XEN)[] pci_get_pdev+0x2f/0xfd
(XEN)[] acpi_find_matched_drhd_unit+0x4d/0x11a
(XEN)[] msi_msg_write_remap_rte+0x2f/0x749
(XEN)[] iommu_update_ire_from_msi+0x36/0x38
(XEN)[] msi.c#write_msi_msg+0x3f/0x188
(XEN)[] __setup_msi_irq+0x3a/0x5c
(XEN)[] setup_msi_irq+0xb5/0xf7
(XEN)[] map_domain_pirq+0x445/0x653
(XEN)[] allocate_and_map_msi_pirq+0x10d/0x184
(XEN)[] physdev_map_pirq+0x1f8/0x26b
(XEN)[] do_physdev_op+0x595/0x110f
(XEN)[] pv_hypercall+0x1ef/0x42c
(XEN)[] entry.o#test_all_events+0/0x30
(XEN)
(XEN)
(XEN) 
(XEN) Panic on CPU 2:
(XEN) Xen BUG at spinlock.c:47
(XEN) 
(XEN)
(XEN) Reboot in five seconds...

Signed-off-by: Chao Gao <chao@intel.com>
---
 xen/drivers/passthrough/vtd/dmar.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/dmar.c b/xen/drivers/passthrough/vtd/dmar.c
index 8a3e240..82040dd 100644
--- a/xen/drivers/passthrough/vtd/dmar.c
+++ b/xen/drivers/passthrough/vtd/dmar.c
@@ -218,17 +218,8 @@ struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev)
 }
 else if ( pdev->info.is_virtfn )
 {
-const struct pci_dev *physfn;
-
 bus = pdev->info.physfn.bus;
-/*
- * Use 0 as 'devfn' to search VT-d unit when the physical function
- * is an Extended Function.
- */
-pcidevs_lock();
-physfn = pci_get_pdev(pdev->seg, bus, pdev->info.physfn.devfn);
-devfn = (physfn && physfn->info.is_extfn) ? 0 : pdev->info.physfn.devfn;
-pcidevs_unlock();
+devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 0 : pdev->info.physfn.devfn;
 }
 else
 {
-- 
1.8.3.1


