Re: [patch V2 00/46] x86, PCI, XEN, genirq ...: Prepare for device MSI

2020-09-01 Thread Boqun Feng
Hi Thomas,

On Wed, Aug 26, 2020 at 01:16:28PM +0200, Thomas Gleixner wrote:
[...]
> 
> The whole lot is also available from git:
> 
>git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git device-msi
> 
> This has been tested on Intel/AMD/KVM but lacks testing on:
> 
> - HYPERV (-ENODEV)

FWIW, I did a build and boot test in a Hyper-V guest with your
development branch; the latest commit is 71cbf478eb6f ("irqchip: Add
IMS (Interrupt Message Storm) driver - NOT FOR MERGING"). Everything
seemed to work fine.

If you want me to set/unset a particular CONFIG option or run some
command for testing purposes, please let me know ;-)

Regards,
Boqun

> - VMD enabled systems (-ENODEV)
> - XEN (-ENOCLUE)
> - IMS (-ENODEV)
> 
> - Any non-X86 code which might depend on the broken compose MSI message
>   logic. Marc expects not much fallout, but agrees that we need to fix
>   it anyway.
> 
> #1 - #3 should be applied unconditionally for obvious reasons
> #4 - #6 are worthwhile cleanups which should be done independent of device MSI
> 
> #7 - #8 look promising to clean up the platform MSI implementation
>   independent of #8, but I neither had cycles nor the stomach to
>   tackle that.
> 
> #9 is obviously just for the folks interested in IMS
> 
> Thanks,
> 
>   tglx



Re: [patch RFC 10/38] x86/ioapic: Consolidate IOAPIC allocation

2020-08-26 Thread Boqun Feng
Hi Thomas,

I hit a compiler error while I was trying to compile this patchset:

arch/x86/kernel/devicetree.c: In function ‘dt_irqdomain_alloc’:
arch/x86/kernel/devicetree.c:232:6: error: ‘struct irq_alloc_info’ has no member named ‘ioapic_id’; did you mean ‘ioapic’?
  232 |  tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
      |      ^~~~~~~~~
      |      ioapic
arch/x86/kernel/devicetree.c:233:6: error: ‘struct irq_alloc_info’ has no member named ‘ioapic_pin’; did you mean ‘ioapic’?
  233 |  tmp.ioapic_pin = fwspec->param[0];
      |      ^~~~~~~~~~
      |      ioapic

with CONFIG_OF=y. IIUC, the following changes need to be folded into
this patch. (At least I can continue to compile the kernel with this
change.)

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index a0e8fc7d85f1..ddffd80f5c52 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -229,8 +229,8 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
 
	it = &of_ioapic_type[type_index];
	ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
-	tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
-	tmp.ioapic_pin = fwspec->param[0];
+	tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+	tmp.ioapic.pin = fwspec->param[0];
 
	return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
 }

Regards,
Boqun

On Fri, Aug 21, 2020 at 02:24:34AM +0200, Thomas Gleixner wrote:
> Move the IOAPIC specific fields into their own struct and reuse the common
> devid. Get rid of the #ifdeffery as it does not matter at all whether the
> alloc info is a couple of bytes longer or not.
> 
> Signed-off-by: Thomas Gleixner 
> Cc: Wei Liu 
> Cc: "K. Y. Srinivasan" 
> Cc: Stephen Hemminger 
> Cc: Joerg Roedel 
> Cc: linux-hyp...@vger.kernel.org
> Cc: io...@lists.linux-foundation.org
> Cc: Haiyang Zhang 
> Cc: Jon Derrick 
> Cc: Lu Baolu 
> ---
>  arch/x86/include/asm/hw_irq.h   |   23 ++-
>  arch/x86/kernel/apic/io_apic.c  |   70 ++--
>  drivers/iommu/amd/iommu.c   |   14 +++
>  drivers/iommu/hyperv-iommu.c|2 -
>  drivers/iommu/intel/irq_remapping.c |   18 -
>  5 files changed, 64 insertions(+), 63 deletions(-)
> 
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -44,6 +44,15 @@ enum irq_alloc_type {
>   X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT,
>  };
>  
> +struct ioapic_alloc_info {
> + int pin;
> + int node;
> + u32 trigger : 1;
> + u32 polarity : 1;
> + u32 valid : 1;
> + struct IO_APIC_route_entry  *entry;
> +};
> +
>  /**
>   * irq_alloc_info - X86 specific interrupt allocation info
>   * @type:X86 specific allocation type
> @@ -53,6 +62,8 @@ enum irq_alloc_type {
>   * @mask:CPU mask for vector allocation
>   * @desc:Pointer to msi descriptor
>   * @data:Allocation specific data
> + *
> + * @ioapic:  IOAPIC specific allocation data
>   */
>  struct irq_alloc_info {
>   enum irq_alloc_type type;
> @@ -64,6 +75,7 @@ struct irq_alloc_info {
>   void*data;
>  
>   union {
> + struct ioapic_alloc_infoioapic;
>   int unused;
>  #ifdef   CONFIG_PCI_MSI
>   struct {
> @@ -71,17 +83,6 @@ struct irq_alloc_info {
>   irq_hw_number_t msi_hwirq;
>   };
>  #endif
> -#ifdef   CONFIG_X86_IO_APIC
> - struct {
> - int ioapic_id;
> - int ioapic_pin;
> - int ioapic_node;
> - u32 ioapic_trigger : 1;
> - u32 ioapic_polarity : 1;
> - u32 ioapic_valid : 1;
> - struct IO_APIC_route_entry *ioapic_entry;
> - };
> -#endif
>  #ifdef   CONFIG_DMAR_TABLE
>   struct {
>   int dmar_id;
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -860,10 +860,10 @@ void ioapic_set_alloc_attr(struct irq_al
>  {
>   init_irq_alloc_info(info, NULL);
>   info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
> - info->ioapic_node = node;
> - info->ioapic_trigger = trigger;
> - info->ioapic_polarity = polarity;
> - info->ioapic_valid = 1;
> + info->ioapic.node = node;
> + info->ioapic.trigger = trigger;
> + info->ioapic.polarity = polarity;
> + info->ioapic.valid = 1;
>  }
>  
>  #ifndef CONFIG_ACPI
> @@ -878,32 +878,32 @@ static void ioapic_copy_alloc_attr(struc
>  
>   copy_irq_alloc_info(dst, src);
>   dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
> - dst->ioapic_id = 

Re: [Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2020-01-27 Thread Boqun Feng
On Fri, Jan 24, 2020 at 10:24:44AM +, Vincenzo Frascino wrote:
> Hi Boqun Feng,
> 
> On 24/01/2020 06:32, Boqun Feng wrote:
> > Hi Vincenzo,
> > 
> 
> [...]
> 
> >>
> >> I had a look at your patches and overall, I could not understand why we 
> >> can't
> >> use the arch_timer to do the same things you are doing with the one you
> >> introduced in this series. What confuses me is that KVM works just fine 
> >> with the
> >> arch_timer which was designed with virtualization in mind. Why do we need
> >> another one? Could you please explain?
> >>
> > 
> > Please note that the guest VM on Hyper-V for ARM64 doesn't use
> > arch_timer as the clocksource. See:
> > 
> > 
> > https://lore.kernel.org/linux-arm-kernel/1570129355-16005-7-git-send-email-mikel...@microsoft.com/
> > 
> > ,  ACPI_SIG_GTDT is used for setting up Hyper-V synthetic clocksource
> > and other initialization work.
> >
> 
> I had a look at it and my question stands: why do we need another timer
> on arm64?
> 

Sorry for the late response. It's the weekend and Chinese New Year, so I got
to spend some time making (and mostly eating) dumplings ;-)

After discussion with Michael, here is some explanation why we need
another timer:

The synthetic clocks that Hyper-V presents in a guest VM were originally
created for the x86 architecture. They provide a level of abstraction
that solves problems like continuity across live migrations, where the
hardware clock (i.e., the TSC in the case of x86) frequency may differ
across the migration. When Hyper-V was brought to ARM64, this
abstraction was maintained to provide consistency across the x86 and
ARM64 architectures, and for both Windows and Linux guest VMs. The
core Linux code for the Hyper-V clocks (in
drivers/clocksource/hyperv_timer.c) is architecture neutral and works on
both x86 and ARM64. As you can see, this part is done in Michael's
patchset.

Arguably, Hyper-V for ARM64 should have optimized for consistency with
the ARM64 community rather than with the existing x86 implementation and
existing guest code in Windows. But at this point, it is what it is,
and the Hyper-V clocks do solve problems like migration that aren't
addressed in ARM64 until v8.4 of the architecture, with the addition of
the counter hardware scaling feature. Hyper-V doesn't currently map the
ARM arch timer interrupts into guest VMs, so we need to use the existing
Hyper-V clocks and the common code that already exists.

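For reference, that common code's read path is roughly the following (a
simplified sketch of hv_read_tsc_page() in
drivers/clocksource/hyperv_timer.c; the memory barriers between the
reads are elided here):

static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
{
	u64 scale, offset, cur_tsc;
	u32 sequence;

	do {
		sequence = READ_ONCE(tsc_pg->tsc_sequence);
		if (!sequence)			/* TSC page invalid, fall back */
			return U64_MAX;

		scale = READ_ONCE(tsc_pg->tsc_scale);
		offset = READ_ONCE(tsc_pg->tsc_offset);
		cur_tsc = hv_get_raw_timer();
	} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);

	/* reference time = ((raw counter * scale) >> 64) + offset */
	return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
}

The hypervisor republishes the scale/offset pair (bumping the sequence)
whenever the underlying counter changes, which is what keeps the
reference time continuous across a live migration.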

Does the above answer your question?

Regards,
Boqun

> > So just to be clear, your suggestion is
> > 
> > 1) Hyper-V guest on ARM64 should use arch_timer as clocksource and vDSO
> > will just work.
> > 
> > or
> > 
> > 2) Even though arch_timer is not used as the clocksource, we can still
> > use it for vDSO.
> > 
> > ?
> > 
> 
> Option #1 would be the preferred solution, unless there is a good reason 
> against.
> 
> > Regards,
> > Boqun
> > 
> 
> -- 
> Regards,
> Vincenzo




Re: [Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2020-01-23 Thread Boqun Feng
Hi Vincenzo,

On Thu, Jan 23, 2020 at 10:48:07AM +, Vincenzo Frascino wrote:
> Hi Boqun Feng,
> 
> sorry for the late reply.
> 

That's OK, thanks for your review ;-)

> On 16/12/2019 00:19, Boqun Feng wrote:
> > Hi,
> > 
> > This is the RFC patchset for vDSO support in ARM64 Hyper-V guest. To
> > test it, Michael's ARM64 support patchset:
> > 
> > 
> > https://lore.kernel.org/linux-arm-kernel/1570129355-16005-1-git-send-email-mikel...@microsoft.com/
> > 
> > is needed.
> > 
> > > Similar to x86, Hyper-V on ARM64 uses a TSC page for guests to read
> > > the virtualized hardware timer. This TSC page is read-only for the
> > > guests, so it can be used as a vDSO data page. And the vDSO (userspace)
> > > code can use the same code for timer reading as the kernel, since
> > > they read the same TSC page.
> > 
> 
> I had a look at your patches and overall, I could not understand why we can't
> use the arch_timer to do the same things you are doing with the one you
> introduced in this series. What confuses me is that KVM works just fine with 
> the
> arch_timer which was designed with virtualization in mind. Why do we need
> another one? Could you please explain?
> 

Please note that the guest VM on Hyper-V for ARM64 doesn't use
arch_timer as the clocksource. See:


https://lore.kernel.org/linux-arm-kernel/1570129355-16005-7-git-send-email-mikel...@microsoft.com/

,  ACPI_SIG_GTDT is used for setting up Hyper-V synthetic clocksource
and other initialization work.

So just to be clear, your suggestion is

1) Hyper-V guest on ARM64 should use arch_timer as clocksource and vDSO
will just work.

or

2) Even though arch_timer is not used as the clocksource, we can still
use it for vDSO.

?

Regards,
Boqun

> > This patchset therefore extends ARM64's __vdso_init() to allow multiple
> > data pages and introduces the vclock_mode concept, similar to x86, to
> > allow different platforms (bare-metal, Hyper-V, etc.) to switch to
> > different __arch_get_hw_counter() implementations. The rest of this
> > patchset does the necessary setup for Hyper-V guests: mapping the tsc page,
> > enabling userspace to read cntvct, etc. to enable vDSO.
> > 
> > This patchset consists of 6 patches:
> > 
> > patch #1 allows hv_get_raw_timer() definition to be overridden for
> > userspace and kernel to share the same hv_read_tsc_page() definition.
> > 
> > patch #2 extends ARM64 to support multiple vDSO data pages.
> > 
> > patch #3 introduces vclock_mode, similar to x86, to allow different
> > __arch_get_hw_counter() implementations for different clocksources.
> > 
> > patch #4 maps Hyper-V TSC page into vDSO data page.
> > 
> > patch #5 allows userspace to read cntvct, so that userspace can
> > efficiently read the clocksource.
> > 
> > patch #6 enables the vDSO for ARM64 Hyper-V guest.
> > 
> > The whole patchset is based on v5.5-rc1 plus Michael's ARM64 support
> > patchset, and I've done a few tests with:
> > 
> > https://github.com/nlynch-mentor/vdsotest
> > 
> > Comments and suggestions are welcome!
> > 
> > Regards,
> > Boqun
> > 
> > 
> 
> -- 
> Regards,
> Vincenzo




Re: [Xen-devel] [RFC 6/6] arm64: hyperv: Enable vDSO

2019-12-17 Thread Boqun Feng
On Tue, Dec 17, 2019 at 03:10:16PM +0100, Vitaly Kuznetsov wrote:
> Boqun Feng  writes:
> 
> > Similar to x86, add a new vclock_mode VCLOCK_HVCLOCK, and reuse the
> > hv_read_tsc_page() for userspace to read tsc page clocksource.
> >
> > Signed-off-by: Boqun Feng (Microsoft) 
> > ---
> >  arch/arm64/include/asm/clocksource.h   |  3 ++-
> >  arch/arm64/include/asm/mshyperv.h  |  2 +-
> >  arch/arm64/include/asm/vdso/gettimeofday.h | 19 +++
> >  3 files changed, 22 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/clocksource.h 
> > b/arch/arm64/include/asm/clocksource.h
> > index fbe80057468c..c6acd45fe748 100644
> > --- a/arch/arm64/include/asm/clocksource.h
> > +++ b/arch/arm64/include/asm/clocksource.h
> > @@ -4,7 +4,8 @@
> >  
> >  #define VCLOCK_NONE    0   /* No vDSO clock available. */
> >  #define VCLOCK_CNTVCT  1   /* vDSO should use cntvct   */
> > -#define VCLOCK_MAX     1
> > +#define VCLOCK_HVCLOCK 2   /* vDSO should use vread_hvclock()  */
> > +#define VCLOCK_MAX     2
> >  
> >  struct arch_clocksource_data {
> > int vclock_mode;
> > diff --git a/arch/arm64/include/asm/mshyperv.h 
> > b/arch/arm64/include/asm/mshyperv.h
> > index 0afb00e3501d..7c85dd816dca 100644
> > --- a/arch/arm64/include/asm/mshyperv.h
> > +++ b/arch/arm64/include/asm/mshyperv.h
> > @@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
> > hv_get_vp_register_output *result);
> >  #define hv_set_reference_tsc(val) \
> > hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
> >  #define hv_set_clocksource_vdso(val) \
> > -   ((val).archdata.vclock_mode = VCLOCK_NONE)
> > +   ((val).archdata.vclock_mode = VCLOCK_HVCLOCK)
> >  
> >  #if IS_ENABLED(CONFIG_HYPERV)
> >  #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
> > diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
> > b/arch/arm64/include/asm/vdso/gettimeofday.h
> > index e6e3fe0488c7..7e689b903f4d 100644
> > --- a/arch/arm64/include/asm/vdso/gettimeofday.h
> > +++ b/arch/arm64/include/asm/vdso/gettimeofday.h
> > @@ -67,6 +67,20 @@ int clock_getres_fallback(clockid_t _clkid, struct 
> > __kernel_timespec *_ts)
> > return ret;
> >  }
> >  
> > +#ifdef CONFIG_HYPERV_TIMER
> > +/* This will override the default hv_get_raw_timer() */
> > +#define hv_get_raw_timer() __arch_counter_get_cntvct()
> > +#include 
> > +
> > +extern struct ms_hyperv_tsc_page
> > +_hvclock_page __attribute__((visibility("hidden")));
> > +
> > +static u64 vread_hvclock(void)
> > +{
> > +   return hv_read_tsc_page(&_hvclock_page);
> > +}
> > +#endif
> 
> The function is almost the same on x86 (&_hvclock_page ->
> _page), would it maybe make sense to move this to arch neutral
> clocksource/hyperv_timer.h?
> 

I'm not sure whether the leading underscore matters for the vDSO data
symbol, so I followed the arch's naming convention. If the leading
underscore has no special purpose, I'm happy to move this to the
arch-neutral header file.

> > +
> >  static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
> >  {
> > u64 res;
> > @@ -78,6 +92,11 @@ static __always_inline u64 __arch_get_hw_counter(s32 
> > clock_mode)
> > if (clock_mode == VCLOCK_NONE)
> > return __VDSO_USE_SYSCALL;
> >  
> > +#ifdef CONFIG_HYPERV_TIMER
> > +   if (likely(clock_mode == VCLOCK_HVCLOCK))
> > +   return vread_hvclock();
> 
> I'm not sure likely() is justified here: it'll make ALL builds which
> enable CONFIG_HYPERV_TIMER (e.g. distro kernels) to prefer
> VCLOCK_HVCLOCK, even if the kernel is not running on Hyper-V.
> 

Makes sense. Thanks for pointing this out! I will change it in the next
version.

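For reference, the change presumably amounts to just dropping the hint
(a sketch of the adjusted hunk):

#ifdef CONFIG_HYPERV_TIMER
	/* No likely(): kernels built with CONFIG_HYPERV_TIMER=y (e.g. distro
	 * kernels) may well not be running on Hyper-V. */
	if (clock_mode == VCLOCK_HVCLOCK)
		return vread_hvclock();
#endif
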
Regards,
Boqun

> > +#endif
> > +
> > /*
> >  * This isb() is required to prevent that the counter value
> >  * is speculated.
> 
> -- 
> Vitaly
> 


[Xen-devel] [RFC 5/6] arm64: hyperv: Enable userspace to read cntvct

2019-12-15 Thread Boqun Feng
Since reading the hyperv-timer clocksource requires reading cntvct,
userspace should be allowed to read it; otherwise reading cntvct will
trap, which makes a vsyscall cost about as much as a syscall.

So enable userspace access on every CPU when a Hyper-V guest boots up.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/hyperv/hv_init.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/hyperv/hv_init.c b/arch/arm64/hyperv/hv_init.c
index 86e4621d5885..1ea97ecfb143 100644
--- a/arch/arm64/hyperv/hv_init.c
+++ b/arch/arm64/hyperv/hv_init.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -45,6 +46,7 @@ EXPORT_SYMBOL_GPL(hv_max_vp_index);
 static int hv_cpu_init(unsigned int cpu)
 {
u64 msr_vp_index;
+   u32 cntkctl;
 
hv_get_vp_index(msr_vp_index);
 
@@ -53,6 +55,11 @@ static int hv_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
 
+   /* Enable EL0 to access cntvct */
+   cntkctl = arch_timer_get_cntkctl();
+   cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+   arch_timer_set_cntkctl(cntkctl);
+
return 0;
 }
 
-- 
2.24.0



[Xen-devel] [RFC 0/6] vDSO support for Hyper-V guest on ARM64

2019-12-15 Thread Boqun Feng
Hi,

This is the RFC patchset for vDSO support in ARM64 Hyper-V guest. To
test it, Michael's ARM64 support patchset:


https://lore.kernel.org/linux-arm-kernel/1570129355-16005-1-git-send-email-mikel...@microsoft.com/

is needed.

Similar to x86, Hyper-V on ARM64 uses a TSC page for guests to read
the virtualized hardware timer. This TSC page is read-only for the
guests, so it can be used as a vDSO data page. And the vDSO (userspace)
code can use the same code for timer reading as the kernel, since
they read the same TSC page.

This patchset therefore extends ARM64's __vdso_init() to allow multiple
data pages and introduces the vclock_mode concept, similar to x86, to
allow different platforms (bare-metal, Hyper-V, etc.) to switch to
different __arch_get_hw_counter() implementations. The rest of this
patchset does the necessary setup for Hyper-V guests: mapping the tsc page,
enabling userspace to read cntvct, etc. to enable vDSO.

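Concretely, the resulting __arch_get_hw_counter() dispatch looks roughly
like the following (a sketch only; see patches #3 and #6 below for the
real code):

static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
{
	u64 res;

	if (clock_mode == VCLOCK_NONE)
		return __VDSO_USE_SYSCALL;	/* fall back to the syscall */

#ifdef CONFIG_HYPERV_TIMER
	if (clock_mode == VCLOCK_HVCLOCK)
		return vread_hvclock();		/* read via the shared TSC page */
#endif

	/* VCLOCK_CNTVCT: read the generic timer directly */
	isb();
	asm volatile("mrs %0, cntvct_el0" : "=r" (res) :: "memory");

	return res;
}
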
This patchset consists of 6 patches:

patch #1 allows hv_get_raw_timer() definition to be overridden for
userspace and kernel to share the same hv_read_tsc_page() definition.

patch #2 extends ARM64 to support multiple vDSO data pages.

patch #3 introduces vclock_mode, similar to x86, to allow different
__arch_get_hw_counter() implementations for different clocksources.

patch #4 maps Hyper-V TSC page into vDSO data page.

patch #5 allows userspace to read cntvct, so that userspace can
efficiently read the clocksource.

patch #6 enables the vDSO for ARM64 Hyper-V guest.

The whole patchset is based on v5.5-rc1 plus Michael's ARM64 support
patchset, and I've done a few tests with:

https://github.com/nlynch-mentor/vdsotest

Comments and suggestions are welcome!

Regards,
Boqun


[Xen-devel] [RFC 6/6] arm64: hyperv: Enable vDSO

2019-12-15 Thread Boqun Feng
Similar to x86, add a new vclock_mode VCLOCK_HVCLOCK, and reuse
hv_read_tsc_page() for userspace to read the tsc page clocksource.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/include/asm/clocksource.h   |  3 ++-
 arch/arm64/include/asm/mshyperv.h  |  2 +-
 arch/arm64/include/asm/vdso/gettimeofday.h | 19 +++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/clocksource.h 
b/arch/arm64/include/asm/clocksource.h
index fbe80057468c..c6acd45fe748 100644
--- a/arch/arm64/include/asm/clocksource.h
+++ b/arch/arm64/include/asm/clocksource.h
@@ -4,7 +4,8 @@
 
 #define VCLOCK_NONE    0   /* No vDSO clock available. */
 #define VCLOCK_CNTVCT  1   /* vDSO should use cntvct   */
-#define VCLOCK_MAX     1
+#define VCLOCK_HVCLOCK 2   /* vDSO should use vread_hvclock()  */
+#define VCLOCK_MAX     2
 
 struct arch_clocksource_data {
int vclock_mode;
diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index 0afb00e3501d..7c85dd816dca 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_set_reference_tsc(val) \
hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
 #define hv_set_clocksource_vdso(val) \
-   ((val).archdata.vclock_mode = VCLOCK_NONE)
+   ((val).archdata.vclock_mode = VCLOCK_HVCLOCK)
 
 #if IS_ENABLED(CONFIG_HYPERV)
 #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
b/arch/arm64/include/asm/vdso/gettimeofday.h
index e6e3fe0488c7..7e689b903f4d 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -67,6 +67,20 @@ int clock_getres_fallback(clockid_t _clkid, struct 
__kernel_timespec *_ts)
return ret;
 }
 
+#ifdef CONFIG_HYPERV_TIMER
+/* This will override the default hv_get_raw_timer() */
+#define hv_get_raw_timer() __arch_counter_get_cntvct()
+#include 
+
+extern struct ms_hyperv_tsc_page
+_hvclock_page __attribute__((visibility("hidden")));
+
+static u64 vread_hvclock(void)
+{
+   return hv_read_tsc_page(&_hvclock_page);
+}
+#endif
+
 static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
 {
u64 res;
@@ -78,6 +92,11 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
+#ifdef CONFIG_HYPERV_TIMER
+   if (likely(clock_mode == VCLOCK_HVCLOCK))
+   return vread_hvclock();
+#endif
+
/*
 * This isb() is required to prevent that the counter value
 * is speculated.
-- 
2.24.0



[Xen-devel] [RFC 4/6] arm64: vdso: hyperv: Map tsc page into vDSO if enabled

2019-12-15 Thread Boqun Feng
On Hyper-V, a tsc page holds the data for adjusting cntvct values to
clocksource cycles, and that's how the Hyper-V guest kernel reads the
clocksource. In order to allow userspace to read the same clocksource
directly, the tsc page has to be mapped into userspace via the vDSO.

Use the framework for vDSO set-up in __vdso_init() to do this.

Note: if HYPERV_TIMER=y but the kernel is using another clocksource or
doesn't have the hyperv timer clocksource, the tsc page will still be
mapped into userspace.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/kernel/vdso.c  | 12 
 arch/arm64/kernel/vdso/vdso.lds.S | 12 +++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index b9b5ec7a3084..18a634987bdc 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -105,14 +106,22 @@ static int __vdso_init(enum arch_vdso_type arch_index)
struct page **vdso_code_pagelist;
unsigned long nr_vdso_pages;
unsigned long pfn;
+   struct ms_hyperv_tsc_page *tsc_page;
+   int tsc_page_idx;
 
if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) {
pr_err("vDSO is not a valid ELF object!\n");
return -EINVAL;
}
 
+   /* One vDSO data page */
vdso_lookup[arch_index].nr_vdso_data_pages = 1;
 
+   /* Grab the Hyper-V tsc page, if enabled, add one more page */
+   tsc_page = hv_get_tsc_page();
+   if (tsc_page)
+   tsc_page_idx = vdso_lookup[arch_index].nr_vdso_data_pages++;
+
vdso_lookup[arch_index].nr_vdso_code_pages = (
vdso_lookup[arch_index].vdso_code_end -
vdso_lookup[arch_index].vdso_code_start) >>
@@ -130,6 +139,9 @@ static int __vdso_init(enum arch_vdso_type arch_index)
/* Grab the vDSO data page. */
vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
 
+   if (tsc_page)
+   vdso_pagelist[tsc_page_idx] = phys_to_page(__pa(tsc_page));
+
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
 
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S 
b/arch/arm64/kernel/vdso/vdso.lds.S
index 7ad2d3a0cd48..e40a1f5a6d30 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -17,7 +17,17 @@ OUTPUT_ARCH(aarch64)
 
 SECTIONS
 {
-   PROVIDE(_vdso_data = . - PAGE_SIZE);
+   /*
+* vdso data pages:
+*   vdso data (1 page)
+*   hv tsc page (1 page if enabled)
+*/
+   PROVIDE(_vdso_data = _hvclock_page - PAGE_SIZE);
+#ifdef CONFIG_HYPERV_TIMER
+   PROVIDE(_hvclock_page = . - PAGE_SIZE);
+#else
+   PROVIDE(_hvclock_page = .);
+#endif
. = VDSO_LBASE + SIZEOF_HEADERS;
 
.hash   : { *(.hash) }  :text
-- 
2.24.0



[Xen-devel] [RFC 2/6] arm64: vdso: Add support for multiple vDSO data pages

2019-12-15 Thread Boqun Feng
Split __vdso_abi::vdso_pages into nr_vdso_{data,code}_pages, so that
__setup_additional_pages() can work with multiple vDSO data pages with
the setup from __vdso_init().

Multiple vDSO data pages are required when running in a virtualized
environment, where the cycles read from cntvct in userspace need to
be adjusted with some data from a page maintained by the hypervisor;
for example, the TSC page in Hyper-V.

This is a prerequisite for vDSO support in ARM64 on Hyper-V.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/kernel/vdso.c | 43 
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 354b11e27c07..b9b5ec7a3084 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -50,7 +50,8 @@ struct __vdso_abi {
const char *name;
const char *vdso_code_start;
const char *vdso_code_end;
-   unsigned long vdso_pages;
+   unsigned long nr_vdso_data_pages;
+   unsigned long nr_vdso_code_pages;
/* Data Mapping */
struct vm_special_mapping *dm;
/* Code Mapping */
@@ -101,6 +102,8 @@ static int __vdso_init(enum arch_vdso_type arch_index)
 {
int i;
struct page **vdso_pagelist;
+   struct page **vdso_code_pagelist;
+   unsigned long nr_vdso_pages;
unsigned long pfn;
 
if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) {
@@ -108,14 +111,18 @@ static int __vdso_init(enum arch_vdso_type arch_index)
return -EINVAL;
}
 
-   vdso_lookup[arch_index].vdso_pages = (
+   vdso_lookup[arch_index].nr_vdso_data_pages = 1;
+
+   vdso_lookup[arch_index].nr_vdso_code_pages = (
vdso_lookup[arch_index].vdso_code_end -
vdso_lookup[arch_index].vdso_code_start) >>
PAGE_SHIFT;
 
-   /* Allocate the vDSO pagelist, plus a page for the data. */
-   vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1,
-   sizeof(struct page *),
+   nr_vdso_pages = vdso_lookup[arch_index].nr_vdso_data_pages +
+   vdso_lookup[arch_index].nr_vdso_code_pages;
+
+   /* Allocate the vDSO pagelist. */
+   vdso_pagelist = kcalloc(nr_vdso_pages, sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
return -ENOMEM;
@@ -123,15 +130,17 @@ static int __vdso_init(enum arch_vdso_type arch_index)
/* Grab the vDSO data page. */
vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
 
-
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
 
-   for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++)
-   vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
+   vdso_code_pagelist = vdso_pagelist +
+vdso_lookup[arch_index].nr_vdso_data_pages;
+
+   for (i = 0; i < vdso_lookup[arch_index].nr_vdso_code_pages; i++)
+   vdso_code_pagelist[i] = pfn_to_page(pfn + i);
 
-   vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0];
-   vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1];
+   vdso_lookup[arch_index].dm->pages = vdso_pagelist;
+   vdso_lookup[arch_index].cm->pages = vdso_code_pagelist;
 
return 0;
 }
@@ -141,26 +150,26 @@ static int __setup_additional_pages(enum arch_vdso_type 
arch_index,
struct linux_binprm *bprm,
int uses_interp)
 {
-   unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
+   unsigned long vdso_base, vdso_text_len, vdso_data_len;
void *ret;
 
-   vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT;
-   /* Be sure to map the data page */
-   vdso_mapping_len = vdso_text_len + PAGE_SIZE;
+   vdso_data_len = vdso_lookup[arch_index].nr_vdso_data_pages << 
PAGE_SHIFT;
+   vdso_text_len = vdso_lookup[arch_index].nr_vdso_code_pages << 
PAGE_SHIFT;
 
-   vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
+   vdso_base = get_unmapped_area(NULL, 0,
+ vdso_data_len + vdso_text_len, 0, 0);
if (IS_ERR_VALUE(vdso_base)) {
ret = ERR_PTR(vdso_base);
goto up_fail;
}
 
-   ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
+   ret = _install_special_mapping(mm, vdso_base, vdso_data_len,
   VM_READ|VM_MAYREAD,
   vdso_lookup[arch_index].dm);
if (IS_ERR(ret))
goto up_fail;
 
-   vdso_base += PAGE_SIZE;
+   vdso_base += vdso_data_len;
mm->context.vdso = (void *)vdso_base;
ret = _install_special_mapping(mm, vdso_base, vdso_text_len,

[Xen-devel] [RFC 3/6] arm/arm64: clocksource: Introduce vclock_mode

2019-12-15 Thread Boqun Feng
Similar to x86, use a vclock_mode in arch_clocksource_data so that
different clocksources can use different read functions in the vDSO.

No functional changes, only preparation for supporting the vDSO in ARM64
on Hyper-V.

Note: the changes for arm are only because arm and arm64 share the same
code in the arch timer driver and require arch_clocksource_data to have
the same field.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm/include/asm/clocksource.h| 6 +-
 arch/arm/kernel/vdso.c| 1 -
 arch/arm64/include/asm/clocksource.h  | 6 +-
 arch/arm64/include/asm/mshyperv.h | 2 +-
 arch/arm64/include/asm/vdso/compat_gettimeofday.h | 5 +++--
 arch/arm64/include/asm/vdso/gettimeofday.h| 5 +++--
 arch/arm64/include/asm/vdso/vsyscall.h| 4 +---
 drivers/clocksource/arm_arch_timer.c  | 8 
 8 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/arm/include/asm/clocksource.h 
b/arch/arm/include/asm/clocksource.h
index 0b350a7e26f3..017c5ab6e587 100644
--- a/arch/arm/include/asm/clocksource.h
+++ b/arch/arm/include/asm/clocksource.h
@@ -1,8 +1,12 @@
 #ifndef _ASM_CLOCKSOURCE_H
 #define _ASM_CLOCKSOURCE_H
 
+#define VCLOCK_NONE    0   /* No vDSO clock available. */
+#define VCLOCK_CNTVCT  1   /* vDSO should use cntvct   */
+#define VCLOCK_MAX     1
+
 struct arch_clocksource_data {
-   bool vdso_direct;   /* Usable for direct VDSO access? */
+   int vclock_mode;
 };
 
 #endif
diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c
index c89ac1b9d28b..09e46ec420fe 100644
--- a/arch/arm/kernel/vdso.c
+++ b/arch/arm/kernel/vdso.c
@@ -263,4 +263,3 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long 
addr)
if (!IS_ERR(vma))
mm->context.vdso = addr;
 }
-
diff --git a/arch/arm64/include/asm/clocksource.h 
b/arch/arm64/include/asm/clocksource.h
index 0ece64a26c8c..fbe80057468c 100644
--- a/arch/arm64/include/asm/clocksource.h
+++ b/arch/arm64/include/asm/clocksource.h
@@ -2,8 +2,12 @@
 #ifndef _ASM_CLOCKSOURCE_H
 #define _ASM_CLOCKSOURCE_H
 
+#define VCLOCK_NONE    0   /* No vDSO clock available. */
+#define VCLOCK_CNTVCT  1   /* vDSO should use cntvct   */
+#define VCLOCK_MAX     1
+
 struct arch_clocksource_data {
-   bool vdso_direct;   /* Usable for direct VDSO access? */
+   int vclock_mode;
 };
 
 #endif
diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index 9cc4aeddf2d0..0afb00e3501d 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -90,7 +90,7 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_set_reference_tsc(val) \
hv_set_vpreg(HV_REGISTER_REFERENCE_TSC, val)
 #define hv_set_clocksource_vdso(val) \
-   ((val).archdata.vdso_direct = false)
+   ((val).archdata.vclock_mode = VCLOCK_NONE)
 
 #if IS_ENABLED(CONFIG_HYPERV)
 #define hv_enable_stimer0_percpu_irq(irq)  enable_percpu_irq(irq, 0)
diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h 
b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
index c50ee1b7d5cd..630d04c3c92e 100644
--- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 #include 
 
 #include 
@@ -117,10 +118,10 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
u64 res;
 
/*
-* clock_mode == 0 implies that vDSO are enabled otherwise
+* clock_mode == VCLOCK_NONE implies that vDSO are disabled so
 * fallback on syscall.
 */
-   if (clock_mode)
+   if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
/*
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h 
b/arch/arm64/include/asm/vdso/gettimeofday.h
index b08f476b72b4..e6e3fe0488c7 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 #include 
 
 #define __VDSO_USE_SYSCALL ULLONG_MAX
@@ -71,10 +72,10 @@ static __always_inline u64 __arch_get_hw_counter(s32 
clock_mode)
u64 res;
 
/*
-* clock_mode == 0 implies that vDSO are enabled otherwise
+* clock_mode == VCLOCK_NONE implies that vDSO are disabled so
 * fallback on syscall.
 */
-   if (clock_mode)
+   if (clock_mode == VCLOCK_NONE)
return __VDSO_USE_SYSCALL;
 
/*
diff --git a/arch/arm64/include/asm/vdso/vsyscall.h 
b/arch/arm64/include/asm/vdso/vsyscall.h
index 0c20a7c1bee5..07f78b0da498 100644
--- a/arch/arm64/include/asm/vdso/vsyscall.h
+++ b/arch/arm64/include/asm/vdso/vsyscall.h
@@ -24,9 +24,7 @@ struct vdso_data *__arm64_get_k_vdso_data(void)
 static __always_inl

[Xen-devel] [RFC 1/6] arm64: hyperv: Allow hv_get_raw_timer() definition to be overridden

2019-12-15 Thread Boqun Feng
In order to support the vDSO, hv_read_tsc_page() should be callable
from userspace once the tsc page is mapped. As a result,
hv_get_raw_timer(), which hv_read_tsc_page() calls, needs to be callable
from both the kernel and the vDSO. Currently, it's defined as
arch_timer_read_counter(), which is a function pointer initialized
(with a kernel address) by the arch timer driver, and therefore not
usable in the vDSO.

Fix this by allowing a previous definition to override the default one,
so that in vDSO code we can define it as a function callable from
userspace.

Signed-off-by: Boqun Feng (Microsoft) 
---
 arch/arm64/include/asm/mshyperv.h | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/mshyperv.h 
b/arch/arm64/include/asm/mshyperv.h
index a8468a611912..9cc4aeddf2d0 100644
--- a/arch/arm64/include/asm/mshyperv.h
+++ b/arch/arm64/include/asm/mshyperv.h
@@ -97,8 +97,15 @@ extern void hv_get_vpreg_128(u32 reg, struct 
hv_get_vp_register_output *result);
 #define hv_disable_stimer0_percpu_irq(irq) disable_percpu_irq(irq)
 #endif
 
-/* ARM64 specific code to read the hardware clock */
+/*
+ * ARM64 specific code to read the hardware clock.
+ *
+ * This could be used in both kernel space and userspace (vDSO), so make it
+ * possible for a previous definition to override the default one.
+ */
+#ifndef hv_get_raw_timer
 #define hv_get_raw_timer() arch_timer_read_counter()
+#endif
 
 #include 
 
-- 
2.24.0



Re: [Xen-devel] [RFC PATCH v2 00/17] RFC: SGX Virtualization design and draft patches

2017-12-24 Thread Boqun Feng
On Mon, Dec 04, 2017 at 08:15:11AM +0800, Boqun Feng wrote:
> Hi all,
> 
> This is the v2 of RFC SGX Virtualization design and draft patches, you

Ping ;-)

Any comments?

Regards,
Boqun

> can find v1 at:
> 
> https://lists.gt.net/xen/devel/483404
> 
> In the new version, I fixed a few things according to the feedback on
> the previous version (mostly cleanups and code movement).
> 
> Besides, Kai and I redesign the SGX MSRs setting up part and introduce
> new XL parameter 'lehash' and 'lewr'.
> 
> Another big change is that I modified the EPC management to fit EPC pages
> into 'struct page_info', and in patches #6 and #7, unscrubbable pages,
> 'PGC_epc', 'MEMF_epc' and 'XENZONE_EPC' are introduced, so that EPC
> management is fully integrated into the existing memory management of Xen.
> This might be the controversial bit, so patches 6~8 simply show the
> idea and drive deeper discussion.
> 
> Detailed changes since v1: (modifications with the tag "[New]" are totally
> new in this series; reviews and comments are highly welcome for those
> parts)
> 
> *   Make SGX related mostly common for x86 by: 1) moving sgx.[ch] to
> arch/x86/ and include/asm-x86/ and 2) renaming EPC related functions
> with domain_* prefix.
> 
> *   Rename ioremap_cache() to ioremap_wb() and make it x86-specific as
> suggested by Jan Beulich.
> 
> *   Remove percpu sgx_cpudata, during bootup secondary CPUs now check
> whether they read different value than boot CPU, if so SGX is
> disabled.
> 
> *   Remove domain_has_sgx_{,launch_control}, and make sure we can
> rely on domain's arch.cpuid->feat.sgx{_lc} for setting checks.
> 
> *   Cleanup the code for CPUID handling as suggested by Andrew Cooper.
> 
> *   Adjust to msr_policy framework for SGX MSRs handling, and remove
> unnecessary fields like 'readable' and 'writable'
> 
> *   Use 'page_info' to maintain EPC pages, and [NEW] add a draft
> implementation for employing xenheap for EPC page management. Please
> see patch 6~8
> 
> *   [New] Modify the XL parameter for SGX, please see section 2.1.1 in
> the updated design doc. 
> 
> *   [New] Use _set_vcpu_msrs hypercall in the toolstack to set the SGX
> related. Please see patch #17.
> 
> *   ACPI related tool changes are temporarily dropped in this patchset,
> as I need more time to resolve the comments and do related tests.
> 
> And the updated design doc is as follows. As in the previous version,
> there are some particular points in the design where we don't know which
> implementation is better. For those, a question mark (?) is added to the
> right of the menu item. And for SGX live migration, thanks to Wei Liu for
> commenting in the previous version's review that it would be nice to
> support if we can, but we'd like to hear more from you guys, so we still
> put a question mark for this item. Your comments on those "question mark
> (?)" parts (and other comments as well, of course) are highly appreciated.
> 
> ===
> 1. SGX Introduction
> 1.1 Overview
> 1.1.1 Enclave
> 1.1.2 EPC (Enclave Page Cache)
> 1.1.3 ENCLS and ENCLU
> 1.2 Discovering SGX Capability
> 1.2.1 Enumerate SGX via CPUID
> 1.2.2 Intel SGX Opt-in Configuration
> 1.3 Enclave Life Cycle
> 1.3.1 Constructing & Destroying Enclave
> 1.3.2 Enclave Entry and Exit
> 1.3.2.1 Synchronous Entry and Exit
> 1.3.2.2 Asynchronous Enclave Exit
> 1.3.3 EPC Eviction and Reload
> 1.4 SGX Launch Control
> 1.5 SGX Interaction with IA32 and IA64 Architecture
> 2. SGX Virtualization Design
> 2.1 High Level Toolstack Changes
> 2.1.1 New 'sgx' XL configure file parameter
> 2.1.2 New XL commands (?)
> 2.1.3 Notify domain's virtual EPC base and size to Xen
> 2.2 High Level Hypervisor Changes
> 2.2.1 EPC Management
> 2.2.2 EPC Virtualization
> 2.2.3 Populate EPC for Guest
> 2.2.4 Launch Control Support
> 2.2.5 CPUID Emulation
> 2.2.6 EPT Violation & ENCLS Trapping Handling
> 2.2.7 Guest Suspend & Resume
> 2.2.8 Destroying Domain
> 2.3 Additional Point: Live Migration, Snapshot Support (?)
> 3. Reference
> 
> 1. SGX Introduction
> 
> 1.1 Overview
> 
> 1.1.1 Enclave
> 
> Intel Software Guard Extensions (SGX) is a set of instructions and mechanisms
> for memory accesses in order to provide secure access for sensitive
> applications and data. SGX allows an application to use its particular 

[Xen-devel] [PATCH v2 16/17] xen: tools: add SGX to applying CPUID policy

2017-12-03 Thread Boqun Feng
From: Kai Huang 

In libxc, a new structure 'xc_cpuid_policy_build_info_t' is added to carry
the domain's EPC base and size info from libxl. libxl_cpuid_apply_policy is
also changed to take 'libxl_domain_build_info_t' as a parameter, from which
the domain's EPC base and size can be obtained and passed to
xc_cpuid_apply_policy. xc_cpuid_apply_policy is extended to support the SGX
CPUID leaf. If the hypervisor doesn't report the SGX feature in the
host-type cpufeatureset, then using the 'epc' parameter results in domain
creation failure, as SGX cannot be supported.

Signed-off-by: Kai Huang 
---
 tools/libxc/include/xenctrl.h   | 14 
 tools/libxc/xc_cpuid_x86.c  | 68 ++---
 tools/libxl/libxl.h |  3 +-
 tools/libxl/libxl_cpuid.c   | 15 ++--
 tools/libxl/libxl_dom.c |  6 +++-
 tools/libxl/libxl_nocpuid.c |  4 ++-
 tools/ocaml/libs/xc/xenctrl_stubs.c | 11 +-
 tools/python/xen/lowlevel/xc/xc.c   | 11 +-
 8 files changed, 121 insertions(+), 11 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 666db0b9193e..ad4429ca5ffd 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1827,6 +1827,19 @@ int xc_domain_debug_control(xc_interface *xch,
 uint32_t vcpu);
 
 #if defined(__i386__) || defined(__x86_64__)
+typedef struct xc_cpuid_policy_build_info_sgx {
+uint64_t epc_base;
+uint64_t epc_size;
+} xc_cpuid_policy_build_info_sgx_t;
+
+typedef struct xc_cpuid_policy_build_info {
+xc_cpuid_policy_build_info_sgx_t sgx;
+} xc_cpuid_policy_build_info_t;
+
+int xc_cpuid_check(xc_interface *xch,
+   const unsigned int *input,
+   const char **config,
+   char **config_transformed);
 int xc_cpuid_set(xc_interface *xch,
  uint32_t domid,
  const unsigned int *input,
@@ -1834,6 +1847,7 @@ int xc_cpuid_set(xc_interface *xch,
  char **config_transformed);
 int xc_cpuid_apply_policy(xc_interface *xch,
   uint32_t domid,
+  xc_cpuid_policy_build_info_t *b_info,
   uint32_t *featureset,
   unsigned int nr_features);
 void xc_cpuid_to_str(const unsigned int *regs,
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 25b922ea2184..a778acf79a64 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -38,7 +38,7 @@ enum {
 #define clear_feature(idx, dst) ((dst) &= ~bitmaskof(idx))
 #define set_feature(idx, dst)   ((dst) |=  bitmaskof(idx))
 
-#define DEF_MAX_BASE 0x0000000du
+#define DEF_MAX_BASE 0x00000012u
 #define DEF_MAX_INTELEXT  0x80000008u
 #define DEF_MAX_AMDEXT0x8000001cu
 
@@ -178,6 +178,8 @@ struct cpuid_domain_info
 /* HVM-only information. */
 bool pae;
 bool nestedhvm;
+
+xc_cpuid_policy_build_info_t *b_info;
 };
 
 static void cpuid(const unsigned int *input, unsigned int *regs)
@@ -369,6 +371,12 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
   const struct cpuid_domain_info *info,
   const unsigned int *input, unsigned int 
*regs)
 {
+xc_cpuid_policy_build_info_t *b_info = info->b_info;
+xc_cpuid_policy_build_info_sgx_t *sgx = NULL;
+
+if ( b_info )
+sgx = &b_info->sgx;
+
 switch ( input[0] )
 {
 case 0x00000004:
@@ -381,6 +389,30 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 regs[3] &= 0x3ffu;
 break;
 
+case 0x00000012:
+if ( !sgx ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( !sgx->epc_base || !sgx->epc_size ) {
+regs[0] = regs[1] = regs[2] = regs[3] = 0;
+break;
+}
+
+if ( input[1] == 2 ) {
+/*
+ * FIX EPC base and size for SGX CPUID leaf 2. Xen hypervisor is
+ * depending on XEN_DOMCTL_set_cpuid to know domain's EPC base
+ * and size.
+ */
+regs[0] = (uint32_t)(sgx->epc_base & 0xfffff000) | 0x1;
+regs[1] = (uint32_t)(sgx->epc_base >> 32);
+regs[2] = (uint32_t)(sgx->epc_size & 0xfffff000) | 0x1;
+regs[3] = (uint32_t)(sgx->epc_size >> 32);
+}
+break;
+
 case 0x80000000:
 if ( regs[0] > DEF_MAX_INTELEXT )
 regs[0] = DEF_MAX_INTELEXT;
@@ -444,6 +476,10 @@ static void xc_cpuid_hvm_policy(xc_interface *xch,
 regs[1] = regs[2] = regs[3] = 0;
 break;
 
+case 0x00000012:
+/* Intel SGX. Passthrough to Intel function */
+break;
+
 case 0x80000000:
 /* Passthrough to cpu vendor specific functions */
 break;
@@ -649,12 +685,13 @@ void xc_cpuid_to_str(const unsigned int *regs, char 
**strs)
 }
 }
 
-static void 

[Xen-devel] [PATCH v2 17/17] xen: tools: add SGX to applying MSR policy

2017-12-03 Thread Boqun Feng
In libxc, a new function 'xc_msr_sgx_set' is added; this function
applies the SGX-related MSR policy to the target domain. It takes
the values of 'lewr' and 'lehash*' in 'libxl_sgx_buildinfo', and sets
the proper MSRs in all vcpus via the 'XEN_DOMCTL_set_vcpu_msrs' hypercall.

If the physical IA32_SGXLEPUBKEYHASHn MSRs are writable:

* Domain's IA32_FEATURE_CONTROL_SGX_LE_WR bit depends on 'lewr' (default
  false)

* If 'lehash' is unset, do nothing, as we already set the proper value
  in sgx_domain_msr_init().

* If 'lehash' is set, set the domain's virtual IA32_SGXLEPUBKEYHASHn
  with its value, and later on the vcpu's virtual IA32_SGXLEPUBKEYHASHn
  will be set with the same value.

If the physical IA32_SGXLEPUBKEYHASHn MSRs are not writable, using the
'lehash' or 'lewr' parameters results in domain creation failure.

Signed-off-by: Boqun Feng <boqun.f...@intel.com>
---
 tools/libxc/Makefile  |  1 +
 tools/libxc/include/xenctrl.h |  2 ++
 tools/libxc/xc_msr_x86.h  | 10 ++
 tools/libxc/xc_sgx.c  | 82 +++
 tools/libxl/libxl_dom.c   | 29 +++
 tools/xl/xl_parse.c   | 10 ++
 6 files changed, 134 insertions(+)
 create mode 100644 tools/libxc/xc_sgx.c

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index 9a019e8dfed5..428430a15c40 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -41,6 +41,7 @@ CTRL_SRCS-y   += xc_foreign_memory.c
 CTRL_SRCS-y   += xc_kexec.c
 CTRL_SRCS-y   += xc_resource.c
 CTRL_SRCS-$(CONFIG_X86) += xc_psr.c
+CTRL_SRCS-$(CONFIG_X86) += xc_sgx.c
 CTRL_SRCS-$(CONFIG_X86) += xc_pagetab.c
 CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c
 CTRL_SRCS-$(CONFIG_FreeBSD) += xc_freebsd.c
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index ad4429ca5ffd..abc9f711141a 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1855,6 +1855,8 @@ void xc_cpuid_to_str(const unsigned int *regs,
 int xc_mca_op(xc_interface *xch, struct xen_mc *mc);
 int xc_mca_op_inject_v2(xc_interface *xch, unsigned int flags,
 xc_cpumap_t cpumap, unsigned int nr_cpus);
+int xc_msr_sgx_set(xc_interface *xch, uint32_t domid, bool lewr,
+   uint64_t *lehash, int max_vcpu);
 #endif
 
 struct xc_px_val {
diff --git a/tools/libxc/xc_msr_x86.h b/tools/libxc/xc_msr_x86.h
index 7f100e71a7a1..54eaa4de8945 100644
--- a/tools/libxc/xc_msr_x86.h
+++ b/tools/libxc/xc_msr_x86.h
@@ -24,6 +24,16 @@
 #define MSR_IA32_CMT_EVTSEL 0x0c8d
 #define MSR_IA32_CMT_CTR0x0c8e
 
+#define MSR_IA32_FEATURE_CONTROL   0x0000003a
+#define IA32_FEATURE_CONTROL_LOCK 0x00000001
+#define IA32_FEATURE_CONTROL_SGX_ENABLE   0x00040000
+#define IA32_FEATURE_CONTROL_SGX_LE_WR0x00020000
+
+#define MSR_IA32_SGXLEPUBKEYHASH0   0x008c
+#define MSR_IA32_SGXLEPUBKEYHASH1   0x008d
+#define MSR_IA32_SGXLEPUBKEYHASH2   0x008e
+#define MSR_IA32_SGXLEPUBKEYHASH3   0x008f
+
 #endif
 
 /*
diff --git a/tools/libxc/xc_sgx.c b/tools/libxc/xc_sgx.c
new file mode 100644
index ..8f97ca0042e0
--- /dev/null
+++ b/tools/libxc/xc_sgx.c
@@ -0,0 +1,82 @@
+/*
+ * xc_sgx.c
+ *
+ * SGX related MSR setup
+ *
+ * Copyright (C) 2017  Intel Corporation
+ * Author Boqun Feng <boqun.f...@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include 
+#include "xc_private.h"
+#include "xc_msr_x86.h"
+
+int xc_msr_sgx_set(xc_interface *xch, uint32_t domid, bool lewr,
+   uint64_t *lehash, int max_vcpu)
+{
+int rc, i, nr_msrs;
+DECLARE_DOMCTL;
+xen_domctl_vcpu_msr_t sgx_msrs[5];
+DECLARE_HYPERCALL_BUFFER(void, buffer);
+
+if ( !lehash && !lewr )
+return 0;
+
+sgx_msrs[0].index = MSR_IA32_FEATURE_CONTROL;
+sgx_msrs[0].reserved = 0;
+sgx_msrs[0].value = IA32_FEATURE_CONTROL_LOCK |
+IA32_FEATURE_CONTROL_SGX_ENABLE |
+(lewr ? IA32_FEATURE_CONTROL_SGX_LE_WR : 0);
+
+if ( !lehash )
+nr_msrs = 1;
+else
+{
+nr_msrs = 5;
+
+for ( i = 0; i < 4; i++ )
+{
+sgx_msrs[i+1].index = MSR_IA32_SGXLEPUBKEYHASH0 + i;
+sgx_msrs[i+1].reserved = 0;
+sgx_msrs[i+1].value = lehash[i];
+}
+}
+
+buffer = xc_hypercall_buffer_alloc(xch, buffer,
+   

[Xen-devel] [PATCH v2 11/17] xen: vmx: handle SGX related MSRs

2017-12-03 Thread Boqun Feng
From: Kai Huang <kai.hu...@linux.intel.com>

This patch handles IA32_FEATURE_CONTROL and IA32_SGXLEPUBKEYHASHn MSRs.

For IA32_FEATURE_CONTROL, if SGX is exposed to the domain, then the
SGX_ENABLE bit is always set. The SGX_LE_WR bit defaults to 0, unless 1)
SGX launch control is exposed to the domain and 2) the XL parameter
'lewr' is true (the handling of this parameter is in a later patch, so
for this patch, the SGX_LE_WR bit is always 0). Writes to
IA32_FEATURE_CONTROL will fault.

For IA32_SGXLEPUBKEYHASHn, the vcpu's virtual ia32_sgxlepubkeyhash[0-3]
are added to the 'sgx' field of 'struct msr_vcpu_policy'.

When a vcpu is initialized, its virtual ia32_sgxlepubkeyhash values are
also initialized. The default values are the physical machine's values.
Later on, we may reset those values with the content of the XL parameter
'lehash'. Besides, if 'lewr' is true and no 'lehash' is provided, we
reset those values to Intel's default value, as on physical machines
those MSRs have Intel's default value.

For IA32_SGXLEPUBKEYHASHn MSR reads from the guest: if SGX launch
control is not exposed to the domain, the guest is not allowed to read
them either; otherwise the vcpu's virtual MSR value is returned.

For IA32_SGXLEPUBKEYHASHn MSR writes from the guest, we allow the guest
to write only if 'lewr' is set (so for this patch, writes will fault).

To make EINIT run successfully in the guest, the vcpu's virtual
IA32_SGXLEPUBKEYHASHn values are written to the physical MSRs when the
vcpu is scheduled in. Moreover, we cache the most recent
IA32_SGXLEPUBKEYHASHn values in a percpu variable, so that we don't need
a wrmsr if the value hasn't changed.

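For illustration, the context-switch path described above is expected to
look roughly like this (a sketch only: the field and variable names
below are made up for illustration, not the exact ones in
xen/arch/x86/sgx.c):

static DEFINE_PER_CPU(uint64_t, sgx_lepubkeyhash[4]);

void sgx_ctxt_switch_to(struct vcpu *v)
{
    uint64_t *cache = this_cpu(sgx_lepubkeyhash);   /* pcpu's last-written hashes */
    unsigned int i;

    for ( i = 0; i < 4; i++ )
    {
        uint64_t val = v->arch.msr->sgx.lepubkeyhash[i]; /* hypothetical field name */

        /* Skip the (expensive) wrmsr if this pcpu already holds the value. */
        if ( val != cache[i] )
        {
            wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, val);
            cache[i] = val;
        }
    }
}
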
Signed-off-by: Kai Huang <kai.hu...@linux.intel.com>
Signed-off-by: Boqun Feng <boqun.f...@intel.com>
---
 xen/arch/x86/domctl.c|  28 -
 xen/arch/x86/hvm/vmx/vmx.c   |  19 ++
 xen/arch/x86/msr.c   |   6 +-
 xen/arch/x86/sgx.c   | 123 +++
 xen/include/asm-x86/cpufeature.h |   3 +
 xen/include/asm-x86/msr-index.h  |   5 ++
 xen/include/asm-x86/msr.h|   5 ++
 xen/include/asm-x86/sgx.h|   9 +++
 8 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 0ee9fb6458ec..eb5d4b346313 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1352,13 +1352,16 @@ long arch_do_domctl(
 
 ret = -EINVAL;
 if ( (v == curr) || /* no vcpu_pause() */
- !is_pv_domain(d) )
+ (!is_pv_domain(d) && !d->arch.cpuid->feat.sgx_lc) )
 break;
 
 /* Count maximum number of optional msrs. */
 if ( boot_cpu_has(X86_FEATURE_DBEXT) )
 nr_msrs += 4;
 
+if ( d->arch.cpuid->feat.sgx_lc )
+nr_msrs += 5;
+
 if ( domctl->cmd == XEN_DOMCTL_get_vcpu_msrs )
 {
 ret = 0; copyback = true;
@@ -1447,6 +1450,29 @@ long arch_do_domctl(
 msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1;
 v->arch.pv_vcpu.dr_mask[msr.index] = msr.value;
 continue;
+case MSR_IA32_FEATURE_CONTROL:
+if ( msr.value & IA32_FEATURE_CONTROL_SGX_LE_WR )
+{
+if ( d->arch.cpuid->feat.sgx_lc && sgx_lewr())
+{
+v->arch.msr->sgx.lewr = true;
+continue;
+}
+else /* Try to set LE_WR while not supported */
+break;
+}
+   continue;
+case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+if ( d->arch.cpuid->feat.sgx_lc && sgx_lewr() )
+{
+sgx_set_vcpu_sgxlepubkeyhash(v,
+msr.index - MSR_IA32_SGXLEPUBKEYHASH0,
+msr.value);
+continue;
+}
+else
+break;
+   continue;
 }
 break;
 }
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 92fb85b13a0c..ce1c95f69062 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -1049,6 +1049,9 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
 
 if ( v->domain->arch.hvm_domain.pi_ops.switch_to )
 v->domain->arch.hvm_domain.pi_ops.switch_to(v);
+
+if ( v->domain->arch.cpuid->feat.sgx_lc && sgx_lewr() )
+sgx_ctxt_switch_to(v);
 }
 
 
@@ -2892,6 +2895,8 @@ static int is_last_branch_msr(u32 ecx)
 static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
 const struct vcpu *curr = current;
+const struct msr_vcpu_policy *vp = curr->arch.msr;
+const struct do

[Xen-devel] [PATCH v2 04/17] xen: x86/mm: introduce ioremap_wb()

2017-12-03 Thread Boqun Feng
From: Kai Huang <kai.hu...@linux.intel.com>

Currently Xen only has a non-cacheable version of ioremap for x86.
Although EPC is reported as reserved memory in e820, it can be mapped
as cacheable. This patch introduces ioremap_wb() (ioremap for cacheable,
write-back memory).

Signed-off-by: Kai Huang <kai.hu...@linux.intel.com>
Signed-off-by: Boqun Feng <boqun.f...@intel.com>
---
 xen/arch/x86/mm.c| 9 +++--
 xen/include/asm-x86/mm.h | 7 +++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 886a5ee327df..db1d1f40 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5207,7 +5207,7 @@ void *__init arch_vmap_virt_end(void)
 return (void *)fix_to_virt(__end_of_fixed_addresses);
 }
 
-void __iomem *ioremap(paddr_t pa, size_t len)
+void __iomem *__ioremap(paddr_t pa, size_t len, unsigned int flags)
 {
 mfn_t mfn = _mfn(PFN_DOWN(pa));
 void *va;
@@ -5222,12 +5222,17 @@ void __iomem *ioremap(paddr_t pa, size_t len)
 unsigned int offs = pa & (PAGE_SIZE - 1);
 unsigned int nr = PFN_UP(offs + len);
 
-va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs;
+va = __vmap(&mfn, nr, 1, 1, flags, VMAP_DEFAULT) + offs;
 }
 
 return (void __force __iomem *)va;
 }
 
+void __iomem *ioremap(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, PAGE_HYPERVISOR_UCMINUS);
+}
+
 int create_perdomain_mapping(struct domain *d, unsigned long va,
  unsigned int nr, l1_pgentry_t **pl1tab,
  struct page_info **ppg)
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 83626085e0a6..77e3c3ba68d1 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -629,4 +629,11 @@ static inline bool arch_mfn_in_directmap(unsigned long mfn)
 return mfn <= (virt_to_mfn(eva - 1) + 1);
 }
 
+extern void __iomem *__ioremap(paddr_t, size_t, unsigned int);
+
+static inline void __iomem *ioremap_wb(paddr_t pa, size_t len)
+{
+return __ioremap(pa, len, PAGE_HYPERVISOR);
+}
+
 #endif /* __ASM_X86_MM_H__ */
-- 
2.15.0



[Xen-devel] [PATCH v2 12/17] xen: vmx: handle ENCLS VMEXIT

2017-12-03 Thread Boqun Feng
From: Kai Huang 

Currently EPC is statically allocated and mapped to the guest, so we don't
have to trap ENCLS, as it runs perfectly in VMX non-root mode. But exposing
SGX to the guest means we also expose the ENABLE_ENCLS bit to the L1
hypervisor, therefore we cannot stop L1 from enabling ENCLS VMEXIT. An ENCLS
VMEXIT from an L2 guest is simply injected into L1; otherwise an ENCLS
VMEXIT is unexpected in L0, and we simply crash the domain.

Signed-off-by: Kai Huang 
---
 xen/arch/x86/hvm/vmx/vmx.c | 10 ++
 xen/arch/x86/hvm/vmx/vvmx.c| 11 +++
 xen/include/asm-x86/hvm/vmx/vmcs.h |  1 +
 xen/include/asm-x86/hvm/vmx/vmx.h  |  1 +
 4 files changed, 23 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index ce1c95f69062..c48c44565fc5 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4118,6 +4118,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
 vmx_handle_apic_write();
 break;
 
+case EXIT_REASON_ENCLS:
+/*
+ * Currently L0 doesn't turn on ENCLS VMEXIT, but L0 cannot stop L1
+ * from enabling ENCLS VMEXIT. ENCLS VMEXIT from L2 guest has already
+ * been handled so by reaching here it is a BUG. We simply crash the
+ * domain.
+ */
+domain_crash(v->domain);
+break;
+
 case EXIT_REASON_PML_FULL:
 vmx_vcpu_flush_pml_buffer(v);
 break;
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index dde02c076b9f..9c6123dc35ee 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -2094,6 +2094,12 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 
*msr_content)
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_ENABLE_EPT;
+/*
+ * If SGX is exposed to the guest, then the ENABLE_ENCLS bit must also
+ * be exposed to the guest.
+ */
+if ( d->arch.cpuid->feat.sgx )
+data |= SECONDARY_EXEC_ENABLE_ENCLS;
 data = gen_vmx_msr(data, 0, host_data);
 break;
 case MSR_IA32_VMX_EXIT_CTLS:
@@ -2316,6 +2322,11 @@ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
 case EXIT_REASON_VMXON:
 case EXIT_REASON_INVEPT:
 case EXIT_REASON_XSETBV:
+/*
+ * L0 doesn't turn on ENCLS VMEXIT now, so an ENCLS VMEXIT must come
+ * from an L2 guest, because L1 has turned on ENCLS VMEXIT.
+ */
+case EXIT_REASON_ENCLS:
 /* inject to L1 */
 nvcpu->nv_vmexit_pending = 1;
 break;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 44ff4f0a113f..f68f3d0f6801 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -407,6 +407,7 @@ enum vmcs_field {
 VIRT_EXCEPTION_INFO = 0x202a,
 XSS_EXIT_BITMAP = 0x202c,
 TSC_MULTIPLIER  = 0x2032,
+ENCLS_EXITING_BITMAP= 0x202E,
 GUEST_PHYSICAL_ADDRESS  = 0x2400,
 VMCS_LINK_POINTER   = 0x2800,
 GUEST_IA32_DEBUGCTL = 0x2802,
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 7341cb191ef2..8547de9168eb 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -215,6 +215,7 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
 #define EXIT_REASON_APIC_WRITE  56
 #define EXIT_REASON_INVPCID 58
 #define EXIT_REASON_VMFUNC  59
+#define EXIT_REASON_ENCLS   60
 #define EXIT_REASON_PML_FULL62
 #define EXIT_REASON_XSAVES  63
 #define EXIT_REASON_XRSTORS 64
-- 
2.15.0



[Xen-devel] [PATCH v2 08/17] xen: x86/mm: add SGX EPC management

2017-12-03 Thread Boqun Feng
Now that the heap allocator supports EPC pages, managing them is simply a
matter of putting the EPC pages into the heap at boot, provided SGX is
supported and the EPC section is reported consistently. Allocation and
reclamation are then just heap allocation and reclamation with MEMF_epc;
a usage sketch follows below.

One more thing we need to do is populate the EPC portion of the
'frame_table' and set up its mapping properly.

SGX is disabled if EPC initialization finds any problem.
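
As a usage illustration (a sketch only, not part of the patch), an EPC
page is allocated and released like this:

    /* Hedged sketch: the heap hands out EPC with MEMF_epc underneath,
     * so callers only see the two wrappers added below. */
    struct page_info *epg = alloc_epc_page();

    if ( !epg )
        return -ENOMEM;

    /* ... give the page to a guest; EREMOVE it on teardown ... */

    free_epc_page(epg);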

Signed-off-by: Boqun Feng <boqun.f...@intel.com>
---
 xen/arch/x86/sgx.c| 161 ++
 xen/include/asm-x86/sgx.h |   3 +
 2 files changed, 164 insertions(+)

diff --git a/xen/arch/x86/sgx.c b/xen/arch/x86/sgx.c
index ead917543f3e..9409b041e4f7 100644
--- a/xen/arch/x86/sgx.c
+++ b/xen/arch/x86/sgx.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
@@ -29,6 +31,13 @@ struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
 static bool __read_mostly opt_sgx_enabled = false;
 boolean_param("sgx", opt_sgx_enabled);
 
+#define total_epc_npages (boot_sgx_cpudata.epc_size >> PAGE_SHIFT)
+#define epc_base_mfn (boot_sgx_cpudata.epc_base >> PAGE_SHIFT)
+#define epc_base_maddr (boot_sgx_cpudata.epc_base)
+#define epc_end_maddr (epc_base_maddr + boot_sgx_cpudata.epc_size)
+
+static void *epc_base_vaddr = NULL;
+
 static void __detect_sgx(struct sgx_cpuinfo *sgxinfo)
 {
 u32 eax, ebx, ecx, edx;
@@ -166,11 +175,163 @@ static void __init print_sgx_cpuinfo(struct sgx_cpuinfo *sgxinfo)
boot_sgx_cpudata.epc_base + boot_sgx_cpudata.epc_size);
 }
 
+struct ft_page {
+struct page_info *pg;
+unsigned int order;
+unsigned long idx;
+struct list_head list;
+};
+
+static int extend_epc_frametable(unsigned long smfn, unsigned long emfn)
+{
+unsigned long idx;
+LIST_HEAD(ft_pages);
+struct ft_page *ftp, *nftp;
+int rc = 0;
+
+for ( ; smfn < emfn; smfn += PDX_GROUP_COUNT )
+{
+idx = pfn_to_pdx(smfn) / PDX_GROUP_COUNT;
+
+if (!test_bit(idx, pdx_group_valid))
+{
+unsigned long s = (unsigned long)pdx_to_page(idx * PDX_GROUP_COUNT);
+struct page_info *pg;
+
+ftp = xzalloc(struct ft_page);
+
+if ( !ftp )
+{
+rc = -ENOMEM;
+goto out;
+}
+
+pg = alloc_domheap_pages(NULL, PDX_GROUP_SHIFT - PAGE_SHIFT, 0);
+
+if ( !pg )
+{
+xfree(ftp);
+rc = -ENOMEM;
+goto out;
+}
+
+ftp->order = PDX_GROUP_SHIFT - PAGE_SHIFT;
+ftp->pg = pg;
+ftp->idx = idx;
+
+list_add_tail(&ftp->list, &ft_pages);
+
+map_pages_to_xen(s, page_to_mfn(pg),
+ 1UL << (PDX_GROUP_SHIFT - PAGE_SHIFT),
+ PAGE_HYPERVISOR);
+memset((void *)s, 0, sizeof(struct page_info) * PDX_GROUP_COUNT);
+}
+}
+
+out:
+list_for_each_entry_safe(ftp, nftp, &ft_pages, list)
+{
+if ( rc )
+{
+unsigned long s = (unsigned long)pdx_to_page(ftp->idx * PDX_GROUP_COUNT);
+
+destroy_xen_mappings(s, s + (1UL << PDX_GROUP_SHIFT));
+free_domheap_pages(ftp->pg, ftp->order);
+}
+list_del(&ftp->list);
+xfree(ftp);
+}
+
+if ( !rc )
+set_pdx_range(smfn, emfn);
+
+return rc;
+}
+
+static int __init init_epc_frametable(unsigned long mfn, unsigned long npages)
+{
+return extend_epc_frametable(mfn, mfn + npages);
+}
+
+static int __init init_epc_heap(void)
+{
+struct page_info *pg;
+unsigned long nrpages = total_epc_npages;
+unsigned long i;
+int rc = 0;
+
+rc = init_epc_frametable(epc_base_mfn, nrpages);
+
+if ( rc )
+return rc;
+
+for ( i = 0; i < nrpages; i++ )
+{
+pg = mfn_to_page(epc_base_mfn + i);
+pg->count_info |= PGC_epc;
+}
+
+init_domheap_pages(epc_base_maddr, epc_end_maddr);
+
+return rc;
+}
+
+struct page_info *alloc_epc_page(void)
+{
+struct page_info *pg = alloc_domheap_page(NULL, MEMF_epc);
+
+if ( !pg )
+return NULL;
+
+/*
+ * PGC_epc will be cleared in free_heap_pages(), so we add it back at
+ * allocation time, so that is_epc_page() returns true when this page
+ * eventually gets freed.
+ */
+pg->count_info |= PGC_epc;
+
+return pg;
+}
+
+void free_epc_page(struct page_info *epg)
+{
+free_domheap_page(epg);
+}
+
+
+static int __init sgx_init_epc(void)
+{
+int rc = 0;
+
+epc_base_vaddr = ioremap_wb(epc_base_maddr,
+total_epc_npages << PAGE_SHIFT);
+
+if ( !epc_base_vaddr )
+{
+printk("Failed to ioremap_wb EPC range. Disabling SGX.\n");
+
+return 

[Xen-devel] [PATCH v2 05/17] xen: p2m: new 'p2m_epc' type for EPC mapping

2017-12-03 Thread Boqun Feng
From: Kai Huang 

A new 'p2m_epc' type is added for EPC mappings. Two wrapper functions,
set_epc_p2m_entry and clear_epc_p2m_entry, are also added for further use;
a usage sketch follows below.
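
For illustration, populating a guest's EPC range with the new wrapper
could look like the sketch below (d, epc_gfn, epc_mfn and n are
illustrative names, not from the patch):

    /* Hedged sketch: map n EPC frames at gfn epc_gfn..epc_gfn+n-1,
     * rolling back already-installed entries on failure. */
    unsigned long i;
    int rc = 0;

    for ( i = 0; i < n; i++ )
    {
        rc = set_epc_p2m_entry(d, epc_gfn + i, _mfn(epc_mfn + i));
        if ( rc )
        {
            while ( i-- )
                clear_epc_p2m_entry(d, epc_gfn + i, _mfn(epc_mfn + i));
            break;
        }
    }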

Signed-off-by: Kai Huang 
---
 xen/arch/x86/mm/p2m-ept.c |  3 +++
 xen/arch/x86/mm/p2m.c | 41 +
 xen/include/asm-x86/p2m.h | 12 ++--
 3 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index b4996ce658ac..34c2e2f8ac1c 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -182,6 +182,9 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
 entry->a = !!cpu_has_vmx_ept_ad;
 entry->d = 0;
 break;
+case p2m_epc:
+entry->r = entry->w = entry->x = 1;
+break;
 }
 
 
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index c72a3cdebb81..8eeafe4b250c 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1192,6 +1192,12 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l,
 return ret;
 }
 
+int set_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+return set_typed_p2m_entry(d, gfn, mfn, PAGE_ORDER_4K, p2m_epc,
+p2m_get_hostp2m(d)->default_access);
+}
+
 /*
  * Returns:
  *0for success
@@ -1278,6 +1284,41 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn_l)
 return ret;
 }
 
+int clear_epc_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+struct p2m_domain *p2m = p2m_get_hostp2m(d);
+mfn_t omfn;
+p2m_type_t ot;
+p2m_access_t oa;
+int ret = 0;
+
+gfn_lock(p2m, gfn, 0);
+
+omfn = p2m->get_entry(p2m, _gfn(gfn), &ot, &oa, 0, NULL, NULL);
+if ( mfn_eq(omfn, INVALID_MFN) || !p2m_is_epc(ot) )
+{
+printk(XENLOG_G_WARNING
+"d%d: invalid EPC map to clear: gfn 0x%lx, type %d.\n",
+d->domain_id, gfn, ot);
+goto out;
+}
+if ( !mfn_eq(mfn, omfn) )
+{
+printk(XENLOG_G_WARNING
+"d%d: mistaken EPC mfn to clear: gfn 0x%lx, "
+"omfn 0x%lx, mfn 0x%lx.\n",
+d->domain_id, gfn, mfn_x(omfn), mfn_x(mfn));
+}
+
+ret = p2m_set_entry(p2m, _gfn(gfn), INVALID_MFN, PAGE_ORDER_4K, p2m_invalid,
+p2m->default_access);
+
+out:
+gfn_unlock(p2m, gfn, 0);
+
+return ret;
+}
+
 /* Returns: 0 for success, -errno for failure */
 int set_shared_p2m_entry(struct domain *d, unsigned long gfn_l, mfn_t mfn)
 {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 17b1d0c8d326..40a40dd54380 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -72,6 +72,7 @@ typedef enum {
 p2m_ram_broken = 13,  /* Broken page, access cause domain crash */
 p2m_map_foreign  = 14,/* ram pages from foreign domain */
 p2m_ioreq_server = 15,
+p2m_epc = 16, /* EPC */
 } p2m_type_t;
 
 /* Modifiers to the query */
@@ -142,10 +143,13 @@ typedef unsigned int p2m_query_t;
 | p2m_to_mask(p2m_ram_logdirty) )
 #define P2M_SHARED_TYPES   (p2m_to_mask(p2m_ram_shared))
 
+#define P2M_EPC_TYPES   (p2m_to_mask(p2m_epc))
+
 /* Valid types not necessarily associated with a (valid) MFN. */
 #define P2M_INVALID_MFN_TYPES (P2M_POD_TYPES  \
| p2m_to_mask(p2m_mmio_direct) \
-   | P2M_PAGING_TYPES)
+   | P2M_PAGING_TYPES \
+   | P2M_EPC_TYPES)
 
 /* Broken type: the frame backing this pfn has failed in hardware
  * and must not be touched. */
@@ -153,6 +157,7 @@ typedef unsigned int p2m_query_t;
 
 /* Useful predicates */
 #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
+#define p2m_is_epc(_t) (p2m_to_mask(_t) & P2M_EPC_TYPES)
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
@@ -163,7 +168,7 @@ typedef unsigned int p2m_query_t;
 /* Grant types are *not* considered valid, because they can be
unmapped at any time and, unless you happen to be the shadow or p2m
implementations, there's no way of synchronising against that. */
-#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
+#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES | P2M_EPC_TYPES))
#define p2m_has_emt(_t)  (p2m_to_mask(_t) & (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
 #define p2m_is_pageable(_t) (p2m_to_mask(_t) & P2M_PAGEABLE_TYPES)
 #define p2m_is_paging(_t)   (p2m_to_mask(_t) & P2M_PAGING_TYPES)
@@ -635,6 +640,9 @@ int clear_identity_p2m_entry(struct domain *d, unsigned long gfn);
 int p2m_add_foreign(struct domain *tdom, unsigned long 

[Xen-devel] [RFC PATCH v2 00/17] RFC: SGX Virtualization design and draft patches

2017-12-03 Thread Boqun Feng
https://software.intel.com/sites/default/files/managed/7c/f1/332831-sdm-vol-3d.pdf

- Paper: Intel SGX Explained
https://eprint.iacr.org/2016/086.pdf

- ISCA 2015 tutorial slides for Intel® SGX - Intel® Software
https://software.intel.com/sites/default/files/332680-002.pdf

Boqun Feng (5):
  xen: mm: introduce non-scrubbable pages
  xen: mm: manage EPC pages in Xen heaps
  xen: x86/mm: add SGX EPC management
  xen: x86: add functions to populate and destroy EPC for domain
  xen: tools: add SGX to applying MSR policy

Kai Huang (12):
  xen: x86: expose SGX to HVM domain in CPU featureset
  xen: x86: add early stage SGX feature detection
  xen: vmx: detect ENCLS VMEXIT
  xen: x86/mm: introduce ioremap_wb()
  xen: p2m: new 'p2m_epc' type for EPC mapping
  xen: x86: add SGX cpuid handling support.
  xen: vmx: handle SGX related MSRs
  xen: vmx: handle ENCLS VMEXIT
  xen: vmx: handle VMEXIT from SGX enclave
  xen: x86: reset EPC when guest got suspended.
  xen: tools: add new 'sgx' parameter support
  xen: tools: add SGX to applying CPUID policy

 docs/misc/xen-command-line.markdown |   8 +
 tools/libxc/Makefile|   1 +
 tools/libxc/include/xc_dom.h|   4 +
 tools/libxc/include/xenctrl.h   |  16 +
 tools/libxc/xc_cpuid_x86.c  |  68 ++-
 tools/libxc/xc_msr_x86.h|  10 +
 tools/libxc/xc_sgx.c|  82 +++
 tools/libxl/libxl.h |   3 +-
 tools/libxl/libxl_cpuid.c   |  15 +-
 tools/libxl/libxl_create.c  |  10 +
 tools/libxl/libxl_dom.c |  65 ++-
 tools/libxl/libxl_internal.h|   2 +
 tools/libxl/libxl_nocpuid.c |   4 +-
 tools/libxl/libxl_types.idl |  11 +
 tools/libxl/libxl_x86.c |  12 +
 tools/ocaml/libs/xc/xenctrl_stubs.c |  11 +-
 tools/python/xen/lowlevel/xc/xc.c   |  11 +-
 tools/xl/xl_parse.c |  86 +++
 tools/xl/xl_parse.h |   1 +
 xen/arch/x86/Makefile   |   1 +
 xen/arch/x86/cpu/common.c   |  15 +
 xen/arch/x86/cpuid.c|  62 ++-
 xen/arch/x86/domctl.c   |  87 ++-
 xen/arch/x86/hvm/hvm.c  |   3 +
 xen/arch/x86/hvm/vmx/vmcs.c |  16 +-
 xen/arch/x86/hvm/vmx/vmx.c  |  68 +++
 xen/arch/x86/hvm/vmx/vvmx.c |  11 +
 xen/arch/x86/mm.c   |   9 +-
 xen/arch/x86/mm/p2m-ept.c   |   3 +
 xen/arch/x86/mm/p2m.c   |  41 ++
 xen/arch/x86/msr.c  |   6 +-
 xen/arch/x86/sgx.c  | 815 
 xen/common/page_alloc.c |  39 +-
 xen/include/asm-arm/mm.h|   9 +
 xen/include/asm-x86/cpufeature.h|   4 +
 xen/include/asm-x86/cpuid.h |  29 +-
 xen/include/asm-x86/hvm/hvm.h   |   3 +
 xen/include/asm-x86/hvm/vmx/vmcs.h  |   8 +
 xen/include/asm-x86/hvm/vmx/vmx.h   |   3 +
 xen/include/asm-x86/mm.h|  19 +-
 xen/include/asm-x86/msr-index.h |   6 +
 xen/include/asm-x86/msr.h   |   5 +
 xen/include/asm-x86/p2m.h   |  12 +-
 xen/include/asm-x86/sgx.h   |  86 +++
 xen/include/public/arch-x86/cpufeatureset.h |   3 +-
 xen/include/xen/mm.h|   2 +
 xen/tools/gen-cpuid.py  |   3 +
 47 files changed, 1757 insertions(+), 31 deletions(-)
 create mode 100644 tools/libxc/xc_sgx.c
 create mode 100644 xen/arch/x86/sgx.c
 create mode 100644 xen/include/asm-x86/sgx.h

-- 
2.15.0



[Xen-devel] [PATCH v2 09/17] xen: x86: add functions to populate and destroy EPC for domain

2017-12-03 Thread Boqun Feng
Add a per-domain structure to store SGX per-domain info. Currently only the
domain's EPC base and size are stored. Also add new functions for further use
(a lifecycle sketch follows below):
- domain_populate_epc  # populate EPC when EPC base & size are notified.
- domain_reset_epc # reset domain's EPC to invalid. Used when the domain
  goes to S3-S5, or is being destroyed.
- domain_destroy_epc   # destroy and free the domain's EPC.

For now, these functions only work for HVM domains, and return -EFAULT when
called for a non-HVM domain.
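
Sketched call flow for the new helpers (the call sites and the exact
domain_reset_epc() prototype are assumptions drawn from this commit
message, not verified against the rest of the series):

    /* Toolstack notifies EPC base & size -> populate. */
    if ( !domain_epc_populated(d) )
        rc = domain_populate_epc(d, epc_base_pfn, epc_npages);

    /* Guest enters S3-S5: EPC content is lost -> reset in place. */
    rc = domain_reset_epc(d);

    /* Domain destruction -> EREMOVE everything and free the pages. */
    if ( domain_epc_populated(d) )
        domain_destroy_epc(d);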

Signed-off-by: Kai Huang <kai.hu...@linux.intel.com>
Signed-off-by: Boqun Feng <boqun.f...@intel.com>
---
 xen/arch/x86/hvm/vmx/vmx.c |   3 +
 xen/arch/x86/sgx.c | 340 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   2 +
 xen/include/asm-x86/sgx.h  |  13 ++
 4 files changed, 358 insertions(+)

diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index b18cceab55b2..92fb85b13a0c 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -417,6 +417,9 @@ static int vmx_domain_initialise(struct domain *d)
 
 static void vmx_domain_destroy(struct domain *d)
 {
+if ( domain_epc_populated(d) )
+domain_destroy_epc(d);
+
 if ( !has_vlapic(d) )
 return;
 
diff --git a/xen/arch/x86/sgx.c b/xen/arch/x86/sgx.c
index 9409b041e4f7..0c898c3086cb 100644
--- a/xen/arch/x86/sgx.c
+++ b/xen/arch/x86/sgx.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 struct sgx_cpuinfo __read_mostly boot_sgx_cpudata;
 
@@ -38,6 +40,344 @@ boolean_param("sgx", opt_sgx_enabled);
 
 static void *epc_base_vaddr = NULL;
 
+static void *map_epc_page_to_xen(struct page_info *pg)
+{
+BUG_ON(!epc_base_vaddr);
+
+return (void *)((unsigned long)epc_base_vaddr +
+((page_to_mfn(pg) - epc_base_mfn) << PAGE_SHIFT));
+}
+
+/* ENCLS opcode */
+#define ENCLS   .byte 0x0f, 0x01, 0xcf
+
+/*
+ * ENCLS leaf functions
+ *
+ * However, currently we only need EREMOVE.
+ */
+enum {
+ECREATE = 0x0,
+EADD= 0x1,
+EINIT   = 0x2,
+EREMOVE = 0x3,
+EDGBRD  = 0x4,
+EDGBWR  = 0x5,
+EEXTEND = 0x6,
+ELDU= 0x8,
+EBLOCK  = 0x9,
+EPA = 0xA,
+EWB = 0xB,
+ETRACK  = 0xC,
+EAUG= 0xD,
+EMODPR  = 0xE,
+EMODT   = 0xF,
+};
+
+/*
+ * ENCLS error code
+ *
+ * Currently we only need SGX_CHILD_PRESENT
+ */
+#define SGX_CHILD_PRESENT   13
+
+static inline int __encls(unsigned long rax, unsigned long rbx,
+  unsigned long rcx, unsigned long rdx)
+{
+int ret;
+
+asm volatile ( "ENCLS;\n\t"
+: "=a" (ret)
+: "a" (rax), "b" (rbx), "c" (rcx), "d" (rdx)
+: "memory", "cc");
+
+return ret;
+}
+
+static inline int __eremove(void *epc)
+{
+unsigned long rbx = 0, rdx = 0;
+
+return __encls(EREMOVE, rbx, (unsigned long)epc, rdx);
+}
+
+static int sgx_eremove(struct page_info *epg)
+{
+void *addr = map_epc_page_to_xen(epg);
+int ret;
+
+BUG_ON(!addr);
+
+ret = __eremove(addr);
+
+return ret;
+}
+
+struct sgx_domain *to_sgx(struct domain *d)
+{
+if (!is_hvm_domain(d))
+return NULL;
+else
+return &d->arch.hvm_domain.vmx.sgx;
+}
+
+bool domain_epc_populated(struct domain *d)
+{
+BUG_ON(!to_sgx(d));
+
+return !!to_sgx(d)->epc_base_pfn;
+}
+
+/*
+ * Reset domain's EPC with EREMOVE. free_epc indicates whether to free EPC
+ * pages during reset. This will be called when domain goes into S3-S5 state
+ * (with free_epc being false), and when domain is destroyed (with free_epc
+ * being true).
+ *
+ * It is possible that EREMOVE is called for a SECS page while it still has
+ * children present, in which case it returns SGX_CHILD_PRESENT. In this
+ * case the SECS page is kept on a temporary list; after EREMOVE has been
+ * run on all other EPC pages, we run EREMOVE on the queued SECS pages
+ * again, and this time SGX_CHILD_PRESENT should never occur, as all
+ * children have been removed by then.
+ *
+ * If EREMOVE returns an unexpected error, the EPC page is in an abnormal
+ * state and is not freed even when free_epc is true, as further use of such
+ * a page could cause unexpected errors and potentially damage other domains.
+ */
+static int __domain_reset_epc(struct domain *d, unsigned long epc_base_pfn,
+unsigned long epc_npages, bool free_epc)
+{
+struct page_list_head secs_list;
+struct page_info *epg, *tmp;
+unsigned long i;
+int ret = 0;
+
+INIT_PAGE_LIST_HEAD(&secs_list);
+
+for ( i = 0; i < epc_npages; i++ )
+{
+unsigned long gfn;
+mfn_t mfn;
+p2m_type_t t;
+int r;
+
+gfn = i + epc_base_pfn;
+mfn = get_gfn_query(d, gfn, &t);
+if ( unlikely(mfn_eq(mfn, INVALID_MFN)) )
+{
+printk(&