[PATCH] vfio/type1: handle case where IOMMU does not support PAGE_SIZE size
Current vfio_pgsize_bitmap code hides the supported IOMMU page sizes smaller than PAGE_SIZE. As a result, in case the IOMMU does not support PAGE_SIZE pages, the alignment check on map/unmap is done with larger page sizes, if any. This can fail although mapping could be done with pages smaller than PAGE_SIZE.

This patch modifies the vfio_pgsize_bitmap implementation so that, in case the IOMMU supports page sizes smaller than PAGE_HOST, we pretend PAGE_HOST is supported and hide sub-PAGE_HOST sizes. That way the user will be able to map/unmap buffers whose size/start address is aligned with PAGE_HOST. Pinning code uses that granularity while the iommu driver can use the sub-PAGE_HOST size to map the buffer.

Signed-off-by: Eric Auger
Signed-off-by: Alex Williamson
---
This was tested on AMD Seattle with a 64kB page host. ARM MMU 401 currently exposes 4kB, 2MB and 1GB page support. With a 64kB page host, the map/unmap check is done against 2MB. Some alignment checks fail, so VFIO_IOMMU_MAP_DMA fails while we could map using the 4kB IOMMU page size.
RFC -> PATCH v1:
- move all modifications in vfio_pgsize_bitmap following Alex's suggestion to expose a fake PAGE_HOST support
- restore WARN_ON's
---
 drivers/vfio/vfio_iommu_type1.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 57d8c37..cee504a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -403,13 +403,26 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
-	unsigned long bitmap = PAGE_MASK;
+	unsigned long bitmap = ULONG_MAX;
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next)
 		bitmap &= domain->domain->ops->pgsize_bitmap;
 	mutex_unlock(&iommu->lock);
 
+	/*
+	 * In case the IOMMU supports page sizes smaller than PAGE_HOST
+	 * we pretend PAGE_HOST is supported and hide sub-PAGE_HOST sizes.
+	 * That way the user will be able to map/unmap buffers whose size/
+	 * start address is aligned with PAGE_HOST. Pinning code uses that
+	 * granularity while iommu driver can use the sub-PAGE_HOST size
+	 * to map the buffer.
+	 */
+	if (bitmap & ~PAGE_MASK) {
+		bitmap &= PAGE_MASK;
+		bitmap |= PAGE_SIZE;
+	}
+
 	return bitmap;
 }
--
1.9.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH v5 28/33] nvdimm acpi: support Get Namespace Label Size function
On Thu, Oct 29, 2015 at 10:16:14AM +0800, Xiao Guangrong wrote:
> 
> On 10/29/2015 12:41 AM, Stefan Hajnoczi wrote:
> >On Wed, Oct 28, 2015 at 10:26:26PM +, Xiao Guangrong wrote:
> >>+struct nvdimm_func_in_get_label_data {
> >>+uint32_t offset; /* the offset in the namespace label data area. */
> >>+uint32_t length; /* the size of data to be read via the function. */
> >>+} QEMU_PACKED;
> >>+typedef struct nvdimm_func_in_get_label_data nvdimm_func_in_get_label_data;
> >
> >./CODING_STYLE "3. Naming":
> >
> > Structured type names are in CamelCase; harder to type but standing out.
> 
> Did not realize it before. Will change its name to:
> NVDIMMFuncInGetLabelData

Great, thanks!

> >>+/*
> >>+ * the max transfer size is the max size transferred by both a
> >>+ * 'Get Namespace Label Data' function and a 'Set Namespace Label Data'
> >>+ * function.
> >>+ */
> >>+static uint32_t nvdimm_get_max_xfer_label_size(void)
> >>+{
> >>+nvdimm_dsm_in *in;
> >>+uint32_t max_get_size, max_set_size, dsm_memory_size = getpagesize();
> >
> >Why is the host's page size relevant here? Did you mean TARGET_PAGE_SIZE?
> 
> Yes.
> 
> NVDIMM is the common code; unfortunately TARGET_PAGE_SIZE is platform specific and QEMU lacks a place to include this kind of specific definition:

Can you make NVDIMM a per-target object file? Although we try to avoid it whenever possible, it means that qemu-system-x86_64, qemu-system-arm, etc will build x86_64-softmmu/hw/.../nvdimm.o, arm-softmmu/hw/.../nvdimm.o, etc. In Makefile.objs put the nvdimm object file in obj-y instead of common-obj-y.
Re: [RFC Patch 00/12] IXGBE: Add live migration support for SRIOV NIC
On 10/29/2015 01:33 AM, Lan Tianyu wrote:

On 2015-10-29 14:58, Alexander Duyck wrote:
Your code was having to do a bunch of shuffling in order to get things set up so that you could bring the interface back up. I would argue that it may actually be faster, at least on the bring-up, to just drop the old rings and start over since it greatly reduces the complexity and the amount of device-related data that has to be moved.

If we give up the old ring after migration and keep DMA running before stopping the VCPU, it seems we don't need to track the Tx/Rx descriptor rings and just make sure that all Rx buffers delivered to the stack have been migrated.

1) Dummy write the Rx buffer before checking the Rx descriptor to ensure the packet is migrated first.

Don't dummy write the Rx descriptor. You should only really need to dummy write the Rx buffer, and you would do so after checking the descriptor, not before. Otherwise you risk corrupting the Rx buffer, because it is possible for you to read the Rx buffer, DMA occurs, and then you write back the Rx buffer and now you have corrupted the memory.

2) Make a copy of the Rx descriptor and then use the copied data to check buffer status. Don't use the original descriptor because it won't be migrated and migration may happen between two accesses of the Rx descriptor.

Do not just blindly copy the Rx descriptor ring. That is a recipe for disaster. The problem is DMA has to happen in a very specific order for things to function correctly. The Rx buffer has to be written and then the Rx descriptor. The problem is you will end up getting a read-ahead on the Rx descriptor ring regardless of which order you dirty things in. The descriptor is only 16 bytes, you can fit 256 of them in a single page. There is a good chance you probably wouldn't be able to migrate if you were under heavy network stress, however you could still have several buffers written in the time it takes for you to halt the VM and migrate the remaining pages.
Those buffers wouldn't be marked as dirty, but odds are the page the descriptors are in would be. As such you will end up with the descriptors but not the buffers. The only way you could possibly migrate the descriptor rings cleanly would be to have enough knowledge about the layout of things to force the descriptor rings to be migrated first, followed by all of the currently mapped Rx buffers. In addition you would need to have some means of tracking all of the Rx buffers, such as an emulated IOMMU, as you would need to migrate all of them, not just part. By doing it this way you would get the Rx descriptor rings in the earliest state possible and would be essentially emulating the Rx buffer writes occurring before the Rx descriptor writes. You would likely have several Rx buffer writes that would be discarded in the process, as there would be no descriptor for them, but at least the state of the system would be consistent.

- Alex
Re: [PATCH v3 0/3] virtio DMA API core stuff
On Thu, 2015-10-29 at 11:01 +0200, Michael S. Tsirkin wrote:
> 
> Example: you have a mix of assigned devices and virtio devices. You don't trust your assigned device vendor not to corrupt your memory, so you want to limit the damage your assigned device can do to your guest, so you use an IOMMU for that. Thus existing iommu=pt within the guest is out.
> 
> But you trust your hypervisor (you have no choice anyway), and you don't want the overhead of tweaking the IOMMU on the data path for virtio. Thus iommu=on is out too.

That's not at all special for virtio or guest VMs. Even with real hardware, we might want performance from *some* devices, and security from others. See the DMA_ATTR_IOMMU_BYPASS which is currently being discussed.

But of course the easy answer in *your* case is just to ask the hypervisor not to put the virtio devices behind an IOMMU at all. Which we were planning to keep as the default behaviour.

In all cases, the DMA API shall do the right thing.

-- 
dwmw2
Re: [GIT PULL 3/3] KVM: s390: use simple switch statement as multiplexer
> On 29.10.2015 at 16:08, Christian Borntraeger wrote:
> 
> We currently do some magic shifting (by exploiting that exit codes are always a multiple of 4) and a table lookup to jump into the exit handlers. This causes some calculations and checks, just to do a potentially expensive function call.
> 
> Changing that to a switch statement gives the compiler the chance to inline and dynamically decide between jump tables or inline compare and branches. In addition it makes the code more readable.
> 
> bloat-o-meter gives me a small reduction in code size:
> 
> add/remove: 0/7 grow/shrink: 1/1 up/down: 986/-1334 (-348)
> function                     old    new   delta
> kvm_handle_sie_intercept      72   1058    +986
> handle_prog                  704    696      -8
> handle_noop                   54      -     -54
> handle_partial_execution      60      -     -60
> intercept_funcs              120      -    -120
> handle_instruction           198      -    -198
> handle_validity              210      -    -210
> handle_stop                  316      -    -316
> handle_external_interrupt    368      -    -368
> 
> Right now my gcc does conditional branches instead of jump tables. The inlining seems to give us enough cycles as some micro-benchmarking shows minimal improvements, but still in noise.

Awesome. I ended up with the same conclusions on switch vs table lookups in the ppc code back in the day.
> Signed-off-by: Christian Borntraeger
> Reviewed-by: Cornelia Huck
> ---
> arch/s390/kvm/intercept.c | 42 +-
> 1 file changed, 21 insertions(+), 21 deletions(-)
>
> diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
> index 7365e8a..b4a5aa1 100644
> --- a/arch/s390/kvm/intercept.c
> +++ b/arch/s390/kvm/intercept.c
> @@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
>  	return -EOPNOTSUPP;
>  }
>
> -static const intercept_handler_t intercept_funcs[] = {
> -	[0x00 >> 2] = handle_noop,
> -	[0x04 >> 2] = handle_instruction,
> -	[0x08 >> 2] = handle_prog,
> -	[0x10 >> 2] = handle_noop,
> -	[0x14 >> 2] = handle_external_interrupt,
> -	[0x18 >> 2] = handle_noop,
> -	[0x1C >> 2] = kvm_s390_handle_wait,
> -	[0x20 >> 2] = handle_validity,
> -	[0x28 >> 2] = handle_stop,
> -	[0x38 >> 2] = handle_partial_execution,
> -};
> -
>  int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
>  {
> -	intercept_handler_t func;
> -	u8 code = vcpu->arch.sie_block->icptcode;
> -
> -	if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
> +	switch (vcpu->arch.sie_block->icptcode) {
> +	case 0x00:
> +	case 0x10:
> +	case 0x18:

... if you could convert these magic numbers to something more telling, however, I think readability would improve even more! That can easily be a follow-up patch though.

Alex
Re: [PATCH v4 28/33] nvdimm acpi: support DSM_FUN_IMPLEMENTED function
On Wed, 21 Oct 2015 21:32:38 +0800 Xiao Guangrong wrote:
> 
> On 10/21/2015 06:49 PM, Stefan Hajnoczi wrote:
> > On Wed, Oct 21, 2015 at 12:26:35AM +0800, Xiao Guangrong wrote:
> >> On 10/20/2015 11:51 PM, Stefan Hajnoczi wrote:
> >>> On Mon, Oct 19, 2015 at 08:54:14AM +0800, Xiao Guangrong wrote:
> +exit:
> +/* Write our output result to dsm memory. */
> +((dsm_out *)dsm_ram_addr)->len = out->len;
> >>>
> >>> Missing byteswap?
> >>>
> >>> I thought you were going to remove this field because it wasn't needed by the guest.
> >>
> >> The @len is the size of the _DSM result buffer; for example, for the function DSM_FUN_IMPLEMENTED the result buffer is 8 bytes, and for DSM_DEV_FUN_NAMESPACE_LABEL_SIZE the buffer size is 4 bytes. It tells ASL code how much memory we need to return to the _DSM caller.
> >>
> >> In _DSM code, it's handled like this:
> >>
> >> "RLEN" is @len, "OBUF" is the remaining memory in the DSM page.
> >>
> >> /* get @len */
> >> aml_append(method, aml_store(aml_name("RLEN"), aml_local(6)));
> >> /* @len << 3 to get bits. */
> >> aml_append(method, aml_store(aml_shiftleft(aml_local(6), aml_int(3)), aml_local(6)));
> >>
> >> /* get @len << 3 bits from OBUF, and return it to the caller. */
> >> aml_append(method, aml_create_field(aml_name("ODAT"), aml_int(0), aml_local(6), "OBUF"));
> >>
> >> Since @len is used internally and not returned to the guest, I did not do the byteswap here.
> >
> > I am not familiar with the ACPI details, but I think this emits bytecode that will be run by the guest's ACPI interpreter?
> >
> > You still need to define the endianness of fields since QEMU and the guest could have different endianness.
> >
> > In other words, will the following work if a big-endian ppc host is running a little-endian x86 guest?
> >
> >   ((dsm_out *)dsm_ram_addr)->len = out->len;
> 
> Er...
> If we do byteswap in QEMU then it is also needed in ASL code; however, ASL lacks this kind of instruction. I guess the ACPI interpreter is smart enough to change the value to Little-Endian for all 2-byte / 4-byte / 8-byte accesses.
> 
> I will do the change in the next version, thanks for pointing it out, Stefan!

According to the ACPI spec, integers are encoded as little endian, so QEMU needs to convert fields accessible by OSPM to it (i.e. do cpu_to_le()).
[GIT PULL 0/3] KVM: s390: Bugfix and cleanups for kvm/next (4.4)
The following changes since commit 60417fcc2b0235dfe3dcd589c56dbe3ea1a64c54:

  KVM: s390: factor out reading of the guest TOD clock (2015-10-13 15:50:35 +0200)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git tags/kvm-s390-next-20151028

for you to fetch changes up to 46b708ea875f14f5496109df053624199f3aae87:

  KVM: s390: use simple switch statement as multiplexer (2015-10-29 15:59:11 +0100)

KVM: s390: Bugfix and cleanups

There is one important bug fix for a potential memory corruption and/or guest errors for guests with 63 or 64 vCPUs. This fix would qualify for 4.3 but is some days too late, given that we are about to release 4.3. Given that this patch is cc stable >= 3.15 anyway, we can handle it via the 4.4 merge window.

This pull request also contains two cleanups.

Christian Borntraeger (2):
  KVM: s390: drop useless newline in debugging data
  KVM: s390: use simple switch statement as multiplexer

David Hildenbrand (1):
  KVM: s390: SCA must not cross page boundaries

 arch/s390/kvm/intercept.c | 42 +-
 arch/s390/kvm/kvm-s390.c  | 12 +++-
 2 files changed, 28 insertions(+), 26 deletions(-)
[GIT PULL 3/3] KVM: s390: use simple switch statement as multiplexer
We currently do some magic shifting (by exploiting that exit codes are always a multiple of 4) and a table lookup to jump into the exit handlers. This causes some calculations and checks, just to do a potentially expensive function call.

Changing that to a switch statement gives the compiler the chance to inline and dynamically decide between jump tables or inline compare and branches. In addition it makes the code more readable.

bloat-o-meter gives me a small reduction in code size:

add/remove: 0/7 grow/shrink: 1/1 up/down: 986/-1334 (-348)
function                     old    new   delta
kvm_handle_sie_intercept      72   1058    +986
handle_prog                  704    696      -8
handle_noop                   54      -     -54
handle_partial_execution      60      -     -60
intercept_funcs              120      -    -120
handle_instruction           198      -    -198
handle_validity              210      -    -210
handle_stop                  316      -    -316
handle_external_interrupt    368      -    -368

Right now my gcc does conditional branches instead of jump tables. The inlining seems to give us enough cycles as some micro-benchmarking shows minimal improvements, but still in noise.
Signed-off-by: Christian Borntraeger
Reviewed-by: Cornelia Huck
---
 arch/s390/kvm/intercept.c | 42 +-
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 7365e8a..b4a5aa1 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
 	return -EOPNOTSUPP;
 }
 
-static const intercept_handler_t intercept_funcs[] = {
-	[0x00 >> 2] = handle_noop,
-	[0x04 >> 2] = handle_instruction,
-	[0x08 >> 2] = handle_prog,
-	[0x10 >> 2] = handle_noop,
-	[0x14 >> 2] = handle_external_interrupt,
-	[0x18 >> 2] = handle_noop,
-	[0x1C >> 2] = kvm_s390_handle_wait,
-	[0x20 >> 2] = handle_validity,
-	[0x28 >> 2] = handle_stop,
-	[0x38 >> 2] = handle_partial_execution,
-};
-
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t func;
-	u8 code = vcpu->arch.sie_block->icptcode;
-
-	if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
+	switch (vcpu->arch.sie_block->icptcode) {
+	case 0x00:
+	case 0x10:
+	case 0x18:
+		return handle_noop(vcpu);
+	case 0x04:
+		return handle_instruction(vcpu);
+	case 0x08:
+		return handle_prog(vcpu);
+	case 0x14:
+		return handle_external_interrupt(vcpu);
+	case 0x1c:
+		return kvm_s390_handle_wait(vcpu);
+	case 0x20:
+		return handle_validity(vcpu);
+	case 0x28:
+		return handle_stop(vcpu);
+	case 0x38:
+		return handle_partial_execution(vcpu);
+	default:
 		return -EOPNOTSUPP;
-	func = intercept_funcs[code >> 2];
-	if (func)
-		return func(vcpu);
-	return -EOPNOTSUPP;
+	}
 }
--
2.4.3
[GIT PULL 2/3] KVM: s390: drop useless newline in debugging data
the s390 debug feature does not need newlines. In fact it will result in empty lines. Get rid of 4 leftovers.

Signed-off-by: Christian Borntraeger
Acked-by: Cornelia Huck
---
 arch/s390/kvm/kvm-s390.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3559617..07a6aa8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -514,7 +514,7 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (gtod_high != 0)
 		return -EINVAL;
-	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
 
 	return 0;
 }
@@ -527,7 +527,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 		return -EFAULT;
 
 	kvm_s390_set_tod_clock(kvm, gtod);
-	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
+	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
 	return 0;
 }
@@ -559,7 +559,7 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (copy_to_user((void __user *)attr->addr, &gtod_high, sizeof(gtod_high)))
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
 
 	return 0;
 }
@@ -571,7 +571,7 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 	gtod = kvm_s390_get_tod_clock_fast(kvm);
 	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod);
+	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
 	return 0;
 }
--
2.4.3
[GIT PULL 1/3] KVM: s390: SCA must not cross page boundaries
From: David Hildenbrand

We seemed to have missed a few corner cases in commit f6c137ff00a4 ("KVM: s390: randomize sca address").

The SCA has a maximum size of 2112 bytes. By setting the sca_offset to some unlucky numbers, we exceed the page.

0x7c0 (1984) -> Fits exactly
0x7d0 (2000) -> 16 bytes out
0x7e0 (2016) -> 32 bytes out
0x7f0 (2032) -> 48 bytes out

One VCPU entry is 32 bytes long. For the last two cases, we actually write data to the other page:
1. The address of the VCPU.
2. Injection/delivery/clearing of SIGP external calls via SIGP IF.

Especially case 2 happens regularly. So this could produce two problems:
1. The guest losing/getting external calls.
2. Random memory overwrites in the host.

So this problem happens on every 127 + 128 created VM with 64 VCPUs.

Cc: sta...@vger.kernel.org # v3.15+
Acked-by: Christian Borntraeger
Signed-off-by: David Hildenbrand
Signed-off-by: Christian Borntraeger
---
 arch/s390/kvm/kvm-s390.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 618c854..3559617 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1098,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.sca)
 		goto out_err;
 	spin_lock(&kvm_lock);
-	sca_offset = (sca_offset + 16) & 0x7f0;
+	sca_offset += 16;
+	if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE)
+		sca_offset = 0;
 	kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
 	spin_unlock(&kvm_lock);
--
2.4.3
[PATCH] vfio/platform: store mapped memory in region, instead of an on-stack copy
vfio_platform_{read,write}_mmio() call ioremap_nocache() to map a region of io memory, which they store in struct vfio_platform_region to be eventually re-used, or unmapped by vfio_platform_regions_cleanup(). These functions receive a copy of their struct vfio_platform_region argument on the stack - so these mapped areas are always allocated, and always leaked.

Pass this argument as a pointer instead.

Fixes: 6e3f26456009 "vfio/platform: read and write support for the device fd"
Signed-off-by: James Morse
---
 drivers/vfio/platform/vfio_platform_common.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index f3b6299..ccf5da5 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -308,17 +308,17 @@ static long vfio_platform_ioctl(void *device_data,
 	return -ENOTTY;
 }
 
-static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
+static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
 				       char __user *buf, size_t count,
 				       loff_t off)
 {
 	unsigned int done = 0;
 
-	if (!reg.ioaddr) {
-		reg.ioaddr =
-			ioremap_nocache(reg.addr, reg.size);
+	if (!reg->ioaddr) {
+		reg->ioaddr =
+			ioremap_nocache(reg->addr, reg->size);
 
-		if (!reg.ioaddr)
+		if (!reg->ioaddr)
 			return -ENOMEM;
 	}
 
@@ -328,7 +328,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
 		if (count >= 4 && !(off % 4)) {
 			u32 val;
 
-			val = ioread32(reg.ioaddr + off);
+			val = ioread32(reg->ioaddr + off);
 			if (copy_to_user(buf, &val, 4))
 				goto err;
 
@@ -336,7 +336,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
 		} else if (count >= 2 && !(off % 2)) {
 			u16 val;
 
-			val = ioread16(reg.ioaddr + off);
+			val = ioread16(reg->ioaddr + off);
 			if (copy_to_user(buf, &val, 2))
 				goto err;
 
@@ -344,7 +344,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
 		} else {
 			u8 val;
 
-			val = ioread8(reg.ioaddr + off);
+			val = ioread8(reg->ioaddr + off);
 			if (copy_to_user(buf, &val, 1))
 				goto err;
 
@@ -377,7 +377,7 @@ static ssize_t vfio_platform_read(void *device_data, char __user *buf,
 		return -EINVAL;
 
 	if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
-		return vfio_platform_read_mmio(vdev->regions[index],
+		return vfio_platform_read_mmio(&vdev->regions[index],
 					       buf, count, off);
 	else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
 		return -EINVAL;	/* not implemented */
@@ -385,17 +385,17 @@ static ssize_t vfio_platform_read(void *device_data, char __user *buf,
 	return -EINVAL;
 }
 
-static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
+static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
 					const char __user *buf, size_t count,
 					loff_t off)
 {
 	unsigned int done = 0;
 
-	if (!reg.ioaddr) {
-		reg.ioaddr =
-			ioremap_nocache(reg.addr, reg.size);
+	if (!reg->ioaddr) {
+		reg->ioaddr =
+			ioremap_nocache(reg->addr, reg->size);
 
-		if (!reg.ioaddr)
+		if (!reg->ioaddr)
 			return -ENOMEM;
 	}
 
@@ -407,7 +407,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
 			if (copy_from_user(&val, buf, 4))
 				goto err;
 
-			iowrite32(val, reg.ioaddr + off);
+			iowrite32(val, reg->ioaddr + off);
 
 			filled = 4;
 		} else if (count >= 2 && !(off % 2)) {
 			u16 val;
@@ -415,7 +415,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
 			if (copy_from_user(&val, buf, 2))
 				goto err;
 
-			iowrite16(val, reg.ioaddr + off);
+			iowrite16(val, reg->ioaddr + off);
 
 			filled = 2;
 		} else {
 			u8 val;
@@ -423,7 +423,7 @@ static ssize_t vfio_platform_write_mmio(struct
[PATCH v2] vfio/type1: handle case where IOMMU does not support PAGE_SIZE size
Current vfio_pgsize_bitmap code hides the supported IOMMU page sizes smaller than PAGE_SIZE. As a result, in case the IOMMU does not support PAGE_SIZE pages, the alignment check on map/unmap is done with larger page sizes, if any. This can fail although mapping could be done with pages smaller than PAGE_SIZE.

This patch modifies the vfio_pgsize_bitmap implementation so that, in case the IOMMU supports page sizes smaller than PAGE_SIZE, we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes. That way the user will be able to map/unmap buffers whose size/start address is aligned with PAGE_SIZE. Pinning code uses that granularity while the iommu driver can use the sub-PAGE_SIZE size to map the buffer.

Signed-off-by: Eric Auger
Signed-off-by: Alex Williamson
Acked-by: Will Deacon
---
This was tested on AMD Seattle with a 64kB page host. ARM MMU 401 currently exposes 4kB, 2MB and 1GB page support. With a 64kB page host, the map/unmap check is done against 2MB. Some alignment checks fail, so VFIO_IOMMU_MAP_DMA fails while we could map using the 4kB IOMMU page size.
v1 -> v2:
- correct PAGE_HOST type in comment and commit msg
- Add Will's R-b

RFC -> PATCH v1:
- move all modifications in vfio_pgsize_bitmap following Alex's suggestion to expose a fake PAGE_SIZE support
- restore WARN_ON's
---
 drivers/vfio/vfio_iommu_type1.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 57d8c37..59d47cb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -403,13 +403,26 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
-	unsigned long bitmap = PAGE_MASK;
+	unsigned long bitmap = ULONG_MAX;
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next)
 		bitmap &= domain->domain->ops->pgsize_bitmap;
 	mutex_unlock(&iommu->lock);
 
+	/*
+	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
+	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
+	 * That way the user will be able to map/unmap buffers whose size/
+	 * start address is aligned with PAGE_SIZE. Pinning code uses that
+	 * granularity while iommu driver can use the sub-PAGE_SIZE size
+	 * to map the buffer.
+	 */
+	if (bitmap & ~PAGE_MASK) {
+		bitmap &= PAGE_MASK;
+		bitmap |= PAGE_SIZE;
+	}
+
 	return bitmap;
 }
--
1.9.1
Re: [RFC PATCH] vfio/type1: Do not support IOMMUs that allow bypass
On Tue, Oct 27, 2015 at 10:00:11AM -0600, Alex Williamson wrote: > On Tue, 2015-10-27 at 15:40 +, Will Deacon wrote: > > On Fri, Oct 16, 2015 at 09:51:22AM -0600, Alex Williamson wrote: > > > Would it be possible to add iommu_domain_geometry support to arm-smmu.c? > > > In addition to this test to verify that DMA cannot bypass the IOMMU, I'd > > > eventually like to pass the aperture information out through the VFIO > > > API. Thanks, > > > > The slight snag here is that we don't know which SMMU in the system the > > domain is attached to at the point when the geometry is queried, so I > > can't give you an upper bound on the aperture. For example, if there is > > an SMMU with a 32-bit input address and another with a 48-bit input > > address. > > > > We could play the same horrible games that we do with the pgsize bitmap, > > and truncate some global aperture everytime we probe an SMMU device, but > > I'd really like to have fewer hacks like that if possible. The root of > > the problem is still that domains are allocated for a bus, rather than > > an IOMMU instance. > > Yes, Intel VT-d has this issue as well. In theory we can have > heterogeneous IOMMU hardware units (DRHDs) in a system and the upper > bound of the geometry could be diminished if we add a less capable DRHD > into the domain. I suspect this is more a theoretical problem than a > practical one though as we're typically mixing similar DRHDs and I think > we're still capable of 39-bit addressing in the least capable version > per the spec. > > In any case, I really want to start testing geometry.force_aperture, > even if we're not yet comfortable to expose the IOMMU limits to the > user. The vfio type1 shouldn't be enabled at all for underlying > hardware that allows DMA bypass. Thanks, Ok, I'll put it on my list of things to look at under the assumption that the actual aperture limits don't need to be accurate as long as DMA to an arbitrary unmapped address always faults. 
Will
Re: [RFC PATCH] vfio/type1: Do not support IOMMUs that allow bypass
On 29/10/15 18:28, Will Deacon wrote: On Tue, Oct 27, 2015 at 10:00:11AM -0600, Alex Williamson wrote: On Tue, 2015-10-27 at 15:40 +, Will Deacon wrote: On Fri, Oct 16, 2015 at 09:51:22AM -0600, Alex Williamson wrote: Would it be possible to add iommu_domain_geometry support to arm-smmu.c? In addition to this test to verify that DMA cannot bypass the IOMMU, I'd eventually like to pass the aperture information out through the VFIO API. Thanks, The slight snag here is that we don't know which SMMU in the system the domain is attached to at the point when the geometry is queried, so I can't give you an upper bound on the aperture. For example, if there is an SMMU with a 32-bit input address and another with a 48-bit input address. We could play the same horrible games that we do with the pgsize bitmap, and truncate some global aperture everytime we probe an SMMU device, but I'd really like to have fewer hacks like that if possible. The root of the problem is still that domains are allocated for a bus, rather than an IOMMU instance. Yes, Intel VT-d has this issue as well. In theory we can have heterogeneous IOMMU hardware units (DRHDs) in a system and the upper bound of the geometry could be diminished if we add a less capable DRHD into the domain. I suspect this is more a theoretical problem than a practical one though as we're typically mixing similar DRHDs and I think we're still capable of 39-bit addressing in the least capable version per the spec. In any case, I really want to start testing geometry.force_aperture, even if we're not yet comfortable to expose the IOMMU limits to the user. The vfio type1 shouldn't be enabled at all for underlying hardware that allows DMA bypass. Thanks, Ok, I'll put it on my list of things to look at under the assumption that the actual aperture limits don't need to be accurate as long as DMA to an arbitrary unmapped address always faults. 
I'm pretty sure we'd only ever set the aperture to the full input address range anyway (since we're not a GART-type thing), in which case we should only need to worry about unmatched streams that don't hit in a domain at all. Doesn't the disable_bypass option already cover that? (FWIW I hacked it up for v2 a while back, too[0]). Robin. [0]: http://www.linux-arm.org/git?p=linux-rm.git;a=commitdiff;h=23a251189fa3330b799a837bd8eb1023aa2dcea4
Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS
On Oct 28, 2015 6:11 PM, "Benjamin Herrenschmidt" wrote: > > On Thu, 2015-10-29 at 09:42 +0900, David Woodhouse wrote: > > On Thu, 2015-10-29 at 09:32 +0900, Benjamin Herrenschmidt wrote: > > > > > On Power, I generally have 2 IOMMU windows for a device, one at the > > > bottom is remapped, and is generally used for 32-bit devices and the > > > one at the top is set up as a bypass > > > > So in the normal case of decent 64-bit devices (and not in a VM), > > they'll *already* be using the bypass region and have full access to > > all of memory, all of the time? And you have no protection against > > driver and firmware bugs causing stray DMA? > > Correct, we chose to do that for performance reasons. Could this be mitigated using pools? I don't know if the net code would play along easily. --Andy
Re: [RFC PATCH] vfio/type1: Do not support IOMMUs that allow bypass
On Thu, Oct 29, 2015 at 06:42:10PM +, Robin Murphy wrote: > On 29/10/15 18:28, Will Deacon wrote: > >On Tue, Oct 27, 2015 at 10:00:11AM -0600, Alex Williamson wrote: > >>On Tue, 2015-10-27 at 15:40 +, Will Deacon wrote: > >>>On Fri, Oct 16, 2015 at 09:51:22AM -0600, Alex Williamson wrote: > Would it be possible to add iommu_domain_geometry support to arm-smmu.c? > In addition to this test to verify that DMA cannot bypass the IOMMU, I'd > eventually like to pass the aperture information out through the VFIO > API. Thanks, > >>> > >>>The slight snag here is that we don't know which SMMU in the system the > >>>domain is attached to at the point when the geometry is queried, so I > >>>can't give you an upper bound on the aperture. For example, if there is > >>>an SMMU with a 32-bit input address and another with a 48-bit input > >>>address. > >>> > >>>We could play the same horrible games that we do with the pgsize bitmap, > >>>and truncate some global aperture everytime we probe an SMMU device, but > >>>I'd really like to have fewer hacks like that if possible. The root of > >>>the problem is still that domains are allocated for a bus, rather than > >>>an IOMMU instance. > >> > >>Yes, Intel VT-d has this issue as well. In theory we can have > >>heterogeneous IOMMU hardware units (DRHDs) in a system and the upper > >>bound of the geometry could be diminished if we add a less capable DRHD > >>into the domain. I suspect this is more a theoretical problem than a > >>practical one though as we're typically mixing similar DRHDs and I think > >>we're still capable of 39-bit addressing in the least capable version > >>per the spec. > >> > >>In any case, I really want to start testing geometry.force_aperture, > >>even if we're not yet comfortable to expose the IOMMU limits to the > >>user. The vfio type1 shouldn't be enabled at all for underlying > >>hardware that allows DMA bypass. 
Thanks, > > > >Ok, I'll put it on my list of things to look at under the assumption that > >the actual aperture limits don't need to be accurate as long as DMA to > >an arbitrary unmapped address always faults. > > I'm pretty sure we'd only ever set the aperture to the full input address > range anyway (since we're not a GART-type thing), in which case we should > only need to worry about unmatched streams that don't hit in a domain at > all. Doesn't the disable_bypass option already cover that? (FWIW I hacked it > up for v2 a while back, too[0]). Well, the "full input address range" is tricky when you have multiple SMMU instances with different input address sizes. I can do something similar to the pgsize_bitmap. I also don't think the disable_bypass option is what we're after -- this is about devices attached to a VFIO domain that can still mysteriously bypass the SMMU for some ranges AFAIU (and shouldn't be an issue for ARM). Will
Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS
On Thu, 2015-10-29 at 11:31 -0700, Andy Lutomirski wrote: > On Oct 28, 2015 6:11 PM, "Benjamin Herrenschmidt" > wrote: > > > > On Thu, 2015-10-29 at 09:42 +0900, David Woodhouse wrote: > > > On Thu, 2015-10-29 at 09:32 +0900, Benjamin Herrenschmidt wrote: > > > > > > > On Power, I generally have 2 IOMMU windows for a device, one at > > > > the > > > > bottom is remapped, and is generally used for 32-bit devices > > > > and the > > > > one at the top is set up as a bypass > > > > > > So in the normal case of decent 64-bit devices (and not in a VM), > > > they'll *already* be using the bypass region and have full access > > > to > > > all of memory, all of the time? And you have no protection > > > against > > > driver and firmware bugs causing stray DMA? > > > > Correct, we chose to do that for performance reasons. > > Could this be mitigated using pools? I don't know if the net code > would play along easily. For the receive side, it shouldn't be beyond the wit of man to introduce an API which allocates *and* DMA-maps a skb. Pass it to netif_rx() still mapped, with a destructor that just shoves it back in a pool for re-use. Doing it for transmit might be a little more complex, but perhaps still possible. -- dwmw2
Re: [PATCH/RFC 0/4] dma ops and virtio
On Wed, Oct 28, 2015 at 5:04 PM, Christian Borntraeger wrote: > Am 29.10.2015 um 07:22 schrieb Andy Lutomirski: >> On Tue, Oct 27, 2015 at 3:48 PM, Christian Borntraeger >> wrote: >>> This is an RFC to check if I am on the right track. There >>> are some attempts to unify the dma ops (Christoph) as well >>> as some attempts to make virtio use the dma API (Andy). >>> >>> At today's discussion at the kernel summit, we concluded that >>> we want to use the same code on all platforms, wherever >>> possible, so having a dummy dma_op might be the easiest >>> solution to keep virtio-ccw as similar as possible to >>> virtio-pci. Andy Lutomirski will rework his patchset to >>> unconditionally use the dma ops. We will also need a >>> compatibility quirk for powerpc to bypass the iommu mappings >>> on older QEMU versions (which translates to all versions as >>> of today) and per device, e.g. via device tree. Ben >>> Herrenschmidt will look into that. >> >> The combination of these patches plus my series don't link for me >> unless I enable PCI. Would s390 need to select HAS_DMA from VIRTIO or >> something similar? > > Well, actually this is a potential improvement for the series. I could just > make the noop dma ops the default for _all_ devices unless a device has its > own dma_ops (e.g. s390 pci) and then unconditionally set HAS_DMA. >> >> Also, is it possible to use QEMU to emulate an s390x? Even just: >> >> qemu-system-s390x -M s390-ccw-virtio > > Yes. We have no interactive bios, and if there is no boot device the bios > will load a disabled wait, which will exit qemu. > > Make sure to compile your kernel for z900 (processor type) as qemu does not > handle all things of newer processors. > You can then do > qemu-system-s390x -nographic -m 256 -kernel vmlinux -initrd > Progress! After getting that sort-of-working, I figured out what was wrong with my earlier command, and I got that working, too.
Now I get: qemu-system-s390x -fsdev local,id=virtfs1,path=/,security_model=none,readonly -device virtio-9p-ccw,fsdev=virtfs1,mount_tag=/dev/root -M s390-ccw-virtio -nodefaults -device sclpconsole,chardev=console -parallel none -net none -echr 1 -serial none -chardev stdio,id=console,signal=off,mux=on -serial chardev:console -mon chardev=console -vga none -display none -kernel arch/s390/boot/bzImage -append 'init=/home/luto/devel/virtme/virtme/guest/virtme-init psmouse.proto=exps "virtme_stty_con=rows 24 cols 150 iutf8" TERM=xterm-256color rootfstype=9p rootflags=ro,version=9p2000.L,trans=virtio,access=any raid=noautodetect debug' Initializing cgroup subsys cpuset Initializing cgroup subsys cpu Initializing cgroup subsys cpuacct Linux version 4.3.0-rc7-00403-ga2b5cd810259-dirty (l...@amaluto.corp.amacapital.net) (gcc version 5.1.1 20150618 (Red Hat Cross 5.1.1-3) (GCC) ) #328 SMP Thu Oct 29 15:46:05 PDT 2015 setup: Linux is running under KVM in 64-bit mode setup: Max memory size: 128MB Zone ranges: DMA [mem 0x-0x7fff] Normal empty Movable zone start for each node Early memory node ranges node 0: [mem 0x-0x07ff] Initmem setup node 0 [mem 0x-0x07ff] On node 0 totalpages: 32768 DMA zone: 512 pages used for memmap DMA zone: 0 pages reserved DMA zone: 32768 pages, LIFO batch:7 PERCPU: Embedded 466 pages/cpu @07605000 s1868032 r8192 d32512 u1908736 pcpu-alloc: s1868032 r8192 d32512 u1908736 alloc=466*4096 pcpu-alloc: [0] 0 [0] 1 Built 1 zonelists in Zone order, mobility grouping on. 
Total pages: 32256 Kernel command line: init=/home/luto/devel/virtme/virtme/guest/virtme-init psmouse.proto=exps "virtme_stty_con=rows 24 cols 150 iutf8" TERM=xterm-256color rootfstype=9p rootflags=ro,version=9p2000.L,trans=virtio,access=any raid=noautodetect debug PID hash table entries: 512 (order: 0, 4096 bytes) Dentry cache hash table entries: 16384 (order: 5, 131072 bytes) Inode-cache hash table entries: 8192 (order: 4, 65536 bytes) Memory: 92552K/131072K available (8229K kernel code, 798K rwdata, 3856K rodata, 2384K init, 14382K bss, 38520K reserved, 0K cma-reserved) Write protected kernel read-only data: 0x10 - 0xccdfff SLUB: HWalign=256, Order=0-3, MinObjects=0, CPUs=2, Nodes=1 Running RCU self tests Hierarchical RCU implementation. RCU debugfs-based tracing is enabled. RCU lockdep checking is enabled. Build-time adjustment of leaf fanout to 64. RCU restricting CPUs from NR_CPUS=256 to nr_cpu_ids=2. RCU: Adjusting geometry for rcu_fanout_leaf=64, nr_cpu_ids=2 NR_IRQS:3 clocksource: tod: mask: 0x max_cycles: 0x3b0a9be803b0a9, max_idle_ns: 1805497147909793 ns console [ttyS1] enabled Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar ... MAX_LOCKDEP_SUBCLASSES: 8 ... MAX_LOCK_DEPTH: 48 ... MAX_LOCKDEP_KEYS:8191 ... CLASSHASH_SIZE: 4096 ... MAX_LOCKDEP_ENTRIES: 32768 ... MAX_LOCKDEP_CHAINS:
Re: [PATCH] vfio/type1: handle case where IOMMU does not support PAGE_SIZE size
On Thu, Oct 29, 2015 at 01:59:45PM +0000, Eric Auger wrote: > Current vfio_pgsize_bitmap code hides the supported IOMMU page > sizes smaller than PAGE_SIZE. As a result, in case the IOMMU > does not support PAGE_SIZE pages, the alignment check on map/unmap > is done with larger page sizes, if any. This can fail although > mapping could be done with pages smaller than PAGE_SIZE. > > This patch modifies the vfio_pgsize_bitmap implementation so that, > in case the IOMMU supports page sizes smaller than PAGE_HOST, > we pretend PAGE_HOST is supported and hide sub-PAGE_HOST sizes. > That way the user will be able to map/unmap buffers whose size/ > start address is aligned with PAGE_HOST. Pinning code uses that > granularity while the iommu driver can use the sub-PAGE_HOST size > to map the buffer. > > Signed-off-by: Eric Auger > Signed-off-by: Alex Williamson > > --- > > This was tested on AMD Seattle with a 64kB page host. ARM MMU-401 > currently exposes 4kB, 2MB and 1GB page support. With a 64kB page host, > the map/unmap check is done against 2MB. Some alignment checks fail, > so VFIO_IOMMU_MAP_DMA fails while we could map using the 4kB IOMMU page > size.
> > RFC -> PATCH v1: > - move all modifications in vfio_pgsize_bitmap following Alex' > suggestion to expose a fake PAGE_HOST support > - restore WARN_ON's > --- > drivers/vfio/vfio_iommu_type1.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c > index 57d8c37..cee504a 100644 > --- a/drivers/vfio/vfio_iommu_type1.c > +++ b/drivers/vfio/vfio_iommu_type1.c > @@ -403,13 +403,26 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, > struct vfio_dma *dma) > static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) > { > struct vfio_domain *domain; > - unsigned long bitmap = PAGE_MASK; > + unsigned long bitmap = ULONG_MAX; > > mutex_lock(&iommu->lock); > list_for_each_entry(domain, &iommu->domain_list, next) > bitmap &= domain->domain->ops->pgsize_bitmap; > mutex_unlock(&iommu->lock); > > + /* > + * In case the IOMMU supports page sizes smaller than PAGE_HOST > + * we pretend PAGE_HOST is supported and hide sub-PAGE_HOST sizes. > + * That way the user will be able to map/unmap buffers whose size/ > + * start address is aligned with PAGE_HOST. Pinning code uses that > + * granularity while iommu driver can use the sub-PAGE_HOST size > + * to map the buffer. > + */ > + if (bitmap & ~PAGE_MASK) { > + bitmap &= PAGE_MASK; > + bitmap |= PAGE_SIZE; > + } > + s/PAGE_HOST/PAGE_SIZE/g (in the cover-letter too) and then I think this looks good: Acked-by: Will Deacon Will
[PATCH v4 5/6] virtio_mmio: Use the DMA API
This switches to vring_create_virtqueue, simplifying the driver and adding DMA API support. Signed-off-by: Andy Lutomirski --- drivers/virtio/virtio_mmio.c | 67 ++-- 1 file changed, 15 insertions(+), 52 deletions(-) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index f499d9da7237..2b9fab52a3cb 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -99,12 +99,6 @@ struct virtio_mmio_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the number of entries in the queue */ - unsigned int num; - - /* the virtual address of the ring queue */ - void *queue; - /* the list node for the virtqueues list */ struct list_head node; }; @@ -322,15 +316,13 @@ static void vm_del_vq(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); struct virtio_mmio_vq_info *info = vq->priv; - unsigned long flags, size; + unsigned long flags; unsigned int index = vq->index; spin_lock_irqsave(&vm_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vm_dev->lock, flags); - vring_del_virtqueue(vq); - /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); if (vm_dev->version == 1) { @@ -340,8 +332,8 @@ static void vm_del_vq(struct virtqueue *vq) WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } - size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); - free_pages_exact(info->queue, size); + vring_del_virtqueue(vq); + kfree(info); } @@ -356,8 +348,6 @@ static void vm_del_vqs(struct virtio_device *vdev) free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); } - - static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name) @@ -365,7 +355,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; struct virtqueue *vq; - unsigned long flags, size;
+ unsigned long flags; + unsigned int num; int err; if (!name) @@ -388,66 +379,40 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, goto error_kmalloc; } - /* Allocate pages for the queue - start with a queue as big as -* possible (limited by maximum size allowed by device), drop down -* to a minimal size, just big enough to fit descriptor table -* and two rings (which makes it "alignment_size * 2") -*/ - info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); - - /* If the device reports a 0 entry queue, we won't be able to -* use it to perform I/O, and vring_new_virtqueue() can't create -* empty queues anyway, so don't bother to set up the device. -*/ - if (info->num == 0) { + num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + if (num == 0) { err = -ENOENT; - goto error_alloc_pages; - } - - while (1) { - size = PAGE_ALIGN(vring_size(info->num, - VIRTIO_MMIO_VRING_ALIGN)); - /* Did the last iter shrink the queue below minimum size? */ - if (size < VIRTIO_MMIO_VRING_ALIGN * 2) { - err = -ENOMEM; - goto error_alloc_pages; - } - - info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); - if (info->queue) - break; - - info->num /= 2; + goto error_new_virtqueue; } /* Create the vring */ - vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, -true, info->queue, vm_notify, callback, name); + vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, +true, true, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; } /* Activate the queue */ - writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); if (vm_dev->version == 1) { writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); - writel(virt_to_phys(info->queue) >> PAGE_SHIFT, + writel(virtqueue_get_desc_addr(vq) >> PAGE_SHIFT, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else {
[PATCH v4 6/6] virtio_pci: Use the DMA API
This switches to vring_create_virtqueue, simplifying the driver and adding DMA API support. Signed-off-by: Andy Lutomirski--- drivers/virtio/virtio_pci_common.h | 7 - drivers/virtio/virtio_pci_legacy.c | 39 +++- drivers/virtio/virtio_pci_modern.c | 61 ++ 3 files changed, 19 insertions(+), 88 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index cd6196b513ad..1a3c689d1b9e 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -35,13 +35,6 @@ struct virtio_pci_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the number of entries in the queue */ - int num; - - /* the ring queue */ - void *queue; - dma_addr_t queue_dma_addr; /* bus address */ - /* the list node for the virtqueues list */ struct list_head node; diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index b5293e5f2af4..8c4e61783441 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -119,7 +119,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, u16 msix_vec) { struct virtqueue *vq; - unsigned long size; u16 num; int err; @@ -131,29 +130,19 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) return ERR_PTR(-ENOENT); - info->num = num; info->msix_vector = msix_vec; - size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); - info->queue = dma_zalloc_coherent(_dev->pci_dev->dev, size, - >queue_dma_addr, - GFP_KERNEL); - if (info->queue == NULL) + /* create the vring */ + vq = vring_create_virtqueue(index, num, + VIRTIO_PCI_VRING_ALIGN, _dev->vdev, + true, false, vp_notify, callback, name); + if (!vq) return ERR_PTR(-ENOMEM); /* activate the queue */ - iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, + iowrite32(virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - /* create 
the vring */ - vq = vring_new_virtqueue(index, info->num, -VIRTIO_PCI_VRING_ALIGN, _dev->vdev, -true, info->queue, vp_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto out_activate_queue; - } - vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY; if (msix_vec != VIRTIO_MSI_NO_VECTOR) { @@ -161,18 +150,15 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; - goto out_assign; + goto out_deactivate; } } return vq; -out_assign: - vring_del_virtqueue(vq); -out_activate_queue: +out_deactivate: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - dma_free_coherent(_dev->pci_dev->dev, size, - info->queue, info->queue_dma_addr); + vring_del_virtqueue(vq); return ERR_PTR(err); } @@ -180,7 +166,6 @@ static void del_vq(struct virtio_pci_vq_info *info) { struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - unsigned long size; iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); @@ -191,14 +176,10 @@ static void del_vq(struct virtio_pci_vq_info *info) ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); } - vring_del_virtqueue(vq); - /* Select and deactivate the queue */ iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); - dma_free_coherent(_dev->pci_dev->dev, size, - info->queue, info->queue_dma_addr); + vring_del_virtqueue(vq); } static const struct virtio_config_ops virtio_pci_config_ops = { diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index fbe0bd1c4881..50b0cd5a501e 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -287,35 +287,6 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_ioread16(_dev->common->msix_config); } -static size_t vring_pci_size(u16 num) -{ - /* We only need a cacheline
[PATCH v4 1/6] virtio-net: Stop doing DMA from the stack
From: "Michael S. Tsirkin" Once virtio starts using the DMA API, we won't be able to safely DMA from the stack. virtio-net does a couple of config DMA requests from small stack buffers -- switch to using dynamically-allocated memory. This should have no effect on any performance-critical code paths. [I wrote the subject and commit message. mst wrote the code. --luto] Signed-off-by: Andy Lutomirski Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 34 +++--- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d8838dedb7a4..f94ab786088f 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -140,6 +140,12 @@ struct virtnet_info { /* CPU hot plug notifier */ struct notifier_block nb; + + /* Control VQ buffers: protected by the rtnl lock */ + struct virtio_net_ctrl_hdr ctrl_hdr; + virtio_net_ctrl_ack ctrl_status; + u8 ctrl_promisc; + u8 ctrl_allmulti; }; struct padded_vnet_hdr { @@ -976,31 +982,30 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, struct scatterlist *out) { struct scatterlist *sgs[4], hdr, stat; - struct virtio_net_ctrl_hdr ctrl; - virtio_net_ctrl_ack status = ~0; unsigned out_num = 0, tmp; /* Caller should know better */ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); - ctrl.class = class; - ctrl.cmd = cmd; + vi->ctrl_status = ~0; + vi->ctrl_hdr.class = class; + vi->ctrl_hdr.cmd = cmd; /* Add header */ - sg_init_one(&hdr, &ctrl, sizeof(ctrl)); + sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr)); sgs[out_num++] = &hdr; if (out) sgs[out_num++] = out; /* Add return status.
*/ - sg_init_one(&stat, &status, sizeof(status)); + sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status)); sgs[out_num] = &stat; BUG_ON(out_num + 1 > ARRAY_SIZE(sgs)); virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC); if (unlikely(!virtqueue_kick(vi->cvq))) - return status == VIRTIO_NET_OK; + return vi->ctrl_status == VIRTIO_NET_OK; /* Spin for a response, the kick causes an ioport write, trapping * into the hypervisor, so the request should be handled immediately. @@ -1009,7 +1014,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, !virtqueue_is_broken(vi->cvq)) cpu_relax(); - return status == VIRTIO_NET_OK; + return vi->ctrl_status == VIRTIO_NET_OK; } static int virtnet_set_mac_address(struct net_device *dev, void *p) @@ -1151,7 +1156,6 @@ static void virtnet_set_rx_mode(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); struct scatterlist sg[2]; - u8 promisc, allmulti; struct virtio_net_ctrl_mac *mac_data; struct netdev_hw_addr *ha; int uc_count; @@ -1163,22 +1167,22 @@ static void virtnet_set_rx_mode(struct net_device *dev) if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX)) return; - promisc = ((dev->flags & IFF_PROMISC) != 0); - allmulti = ((dev->flags & IFF_ALLMULTI) != 0); + vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0); + vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0); - sg_init_one(sg, &promisc, sizeof(promisc)); + sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_PROMISC, sg)) dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", - promisc ? "en" : "dis"); + vi->ctrl_promisc ? "en" : "dis"); - sg_init_one(sg, &allmulti, sizeof(allmulti)); + sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, VIRTIO_NET_CTRL_RX_ALLMULTI, sg)) dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", - allmulti ? "en" : "dis"); + vi->ctrl_allmulti ?
"en" : "dis"); uc_count = netdev_uc_count(dev); mc_count = netdev_mc_count(dev); -- 2.4.3
[PATCH v4 3/6] virtio_pci: Use the DMA API
This fixes virtio-pci on platforms and busses that have IOMMUs. This will break the experimental QEMU Q35 IOMMU support until QEMU is fixed. In exchange, it fixes physical virtio hardware as well as virtio-pci running under Xen. We should clean up the virtqueue API to do its own allocation and teach virtqueue_get_avail and virtqueue_get_used to return DMA addresses directly. Signed-off-by: Andy Lutomirski--- drivers/virtio/virtio_pci_common.h | 3 ++- drivers/virtio/virtio_pci_legacy.c | 19 +++ drivers/virtio/virtio_pci_modern.c | 34 -- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index b976d968e793..cd6196b513ad 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -38,8 +38,9 @@ struct virtio_pci_vq_info { /* the number of entries in the queue */ int num; - /* the virtual address of the ring queue */ + /* the ring queue */ void *queue; + dma_addr_t queue_dma_addr; /* bus address */ /* the list node for the virtqueues list */ struct list_head node; diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 48bc9797e530..b5293e5f2af4 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -135,12 +135,14 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, info->msix_vector = msix_vec; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); - info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); + info->queue = dma_zalloc_coherent(_dev->pci_dev->dev, size, + >queue_dma_addr, + GFP_KERNEL); if (info->queue == NULL) return ERR_PTR(-ENOMEM); /* activate the queue */ - iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, + iowrite32(info->queue_dma_addr >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); /* create the vring */ @@ -169,7 +171,8 @@ out_assign: vring_del_virtqueue(vq); out_activate_queue: 
iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - free_pages_exact(info->queue, size); + dma_free_coherent(_dev->pci_dev->dev, size, + info->queue, info->queue_dma_addr); return ERR_PTR(err); } @@ -194,7 +197,8 @@ static void del_vq(struct virtio_pci_vq_info *info) iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); - free_pages_exact(info->queue, size); + dma_free_coherent(_dev->pci_dev->dev, size, + info->queue, info->queue_dma_addr); } static const struct virtio_config_ops virtio_pci_config_ops = { @@ -227,6 +231,13 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) return -ENODEV; } + rc = dma_set_mask_and_coherent(_dev->dev, DMA_BIT_MASK(64)); + if (rc) + rc = dma_set_mask_and_coherent(_dev->dev, + DMA_BIT_MASK(32)); + if (rc) + dev_warn(_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy"); if (rc) return rc; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 8e5cf194cc0b..fbe0bd1c4881 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -293,14 +293,16 @@ static size_t vring_pci_size(u16 num) return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES)); } -static void *alloc_virtqueue_pages(int *num) +static void *alloc_virtqueue_pages(struct virtio_pci_device *vp_dev, + int *num, dma_addr_t *dma_addr) { void *pages; /* TODO: allocate each queue chunk individually */ for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) { - pages = alloc_pages_exact(vring_pci_size(*num), - GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); + pages = dma_zalloc_coherent( + _dev->pci_dev->dev, vring_pci_size(*num), + dma_addr, GFP_KERNEL|__GFP_NOWARN); if (pages) return pages; } @@ -309,7 +311,9 @@ static void *alloc_virtqueue_pages(int *num) return NULL; /* Try to get a single page. You are my only hope! 
*/ - return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO); + return dma_zalloc_coherent( + _dev->pci_dev->dev,
[PATCH v4 4/6] virtio: Add improved queue allocation API
This leaves vring_new_virtqueue alone for compatbility, but it adds two new improved APIs: vring_create_virtqueue: Creates a virtqueue backed by automatically allocated coherent memory. (Some day it this could be extended to support non-coherent memory, too, if there ends up being a platform on which it's worthwhile.) __vring_new_virtqueue: Creates a virtqueue with a manually-specified layout. This should allow mic_virtio to work much more cleanly. Signed-off-by: Andy Lutomirski--- drivers/virtio/virtio_ring.c | 164 +++ include/linux/virtio.h | 23 +- include/linux/virtio_ring.h | 35 + 3 files changed, 190 insertions(+), 32 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a872eb89587f..f269e1cba00c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -91,6 +91,11 @@ struct vring_virtqueue /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); + /* DMA, allocation, and size information */ + bool we_own_ring; + size_t queue_size_in_bytes; + dma_addr_t queue_dma_addr; + #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; @@ -821,36 +826,31 @@ irqreturn_t vring_interrupt(int irq, void *_vq) } EXPORT_SYMBOL_GPL(vring_interrupt); -struct virtqueue *vring_new_virtqueue(unsigned int index, - unsigned int num, - unsigned int vring_align, - struct virtio_device *vdev, - bool weak_barriers, - void *pages, - bool (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *), - const char *name) +struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring vring, + struct virtio_device *vdev, + bool weak_barriers, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) { - struct vring_virtqueue *vq; unsigned int i; + struct vring_virtqueue *vq; - /* We assume num is a power of 2. 
*/ - if (num & (num - 1)) { - dev_warn(>dev, "Bad virtqueue length %u\n", num); - return NULL; - } - - vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state), + vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state), GFP_KERNEL); if (!vq) return NULL; - vring_init(>vring, num, pages, vring_align); + vq->vring = vring; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; - vq->vq.num_free = num; + vq->vq.num_free = vring.num; vq->vq.index = index; + vq->we_own_ring = false; + vq->queue_dma_addr = 0; + vq->queue_size_in_bytes = 0; vq->notify = notify; vq->weak_barriers = weak_barriers; vq->broken = false; @@ -871,18 +871,105 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < num-1; i++) + for (i = 0; i < vring.num-1; i++) vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); - memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state)); + memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state)); return >vq; } +EXPORT_SYMBOL_GPL(__vring_new_virtqueue); + +struct virtqueue *vring_create_virtqueue( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct virtqueue *vq; + void *queue; + dma_addr_t dma_addr; + size_t queue_size_in_bytes; + struct vring vring; + + /* We assume num is a power of 2. */ + if (num & (num - 1)) { + dev_warn(>dev, "Bad virtqueue length %u\n", num); + return NULL; + } + + /* TODO: allocate each queue chunk individually */ + for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { + queue = dma_zalloc_coherent( + vdev->dev.parent, vring_size(num, vring_align), + _addr, GFP_KERNEL|__GFP_NOWARN); + if (queue) +
[PATCH v4 0/6] virtio core DMA API conversion
This switches virtio to use the DMA API unconditionally. I'm sure it breaks things, but it seems to work on x86 using virtio-pci, with and without Xen, and using both the modern 1.0 variant and the legacy variant.

This appears to work on native and Xen x86_64 using both modern and legacy virtio-pci. It also appears to work on arm and arm64.

It definitely won't work as-is on s390x, and I haven't been able to test Christian's patches because I can't get virtio-ccw to work in QEMU at all. I don't know what I'm doing wrong.

It doesn't work on ppc64. Ben, consider yourself pinged to send me a patch :)

It doesn't work on sparc64. I didn't realize at Kernel Summit that sparc64 has the same problem as ppc64.

DaveM, for background, we're trying to fix virtio to use the DMA API. That will require that every platform that uses virtio supplies valid DMA operations on devices that use virtio_ring. Unfortunately, QEMU historically ignores the IOMMU on virtio devices.

On x86, this isn't really a problem. x86 has a nice way for the platform to describe which devices are behind an IOMMU, and QEMU will be adjusted accordingly. The only thing that will break is a recently-added experimental mode.

Ben's plan for powerpc is to add a quirk for existing virtio-pci devices and to eventually update the devicetree stuff to allow QEMU to tell the guest which devices use the IOMMU.

AFAICT sparc has a similar problem to powerpc. DaveM, can you come up with a straightforward way to get sparc's DMA API to work correctly for virtio-pci devices?

NB: Sadly, the platforms I've successfully tested on don't include any big-endian platforms, so there could still be lurking endian problems.

Changes from v3:
 - More big-endian fixes.
 - Added better virtio-ring APIs that handle allocation and use them in
   virtio-mmio and virtio-pci.
 - Switch to Michael's virtio-net patch.

Changes from v2:
 - Fix vring_mapping_error incorrect argument

Changes from v1:
 - Fix an endian conversion error causing a BUG to hit.
 - Fix a DMA ordering issue (swiotlb=force works now).
 - Minor cleanups.

Andy Lutomirski (5):
  virtio_ring: Support DMA APIs
  virtio_pci: Use the DMA API
  virtio: Add improved queue allocation API
  virtio_mmio: Use the DMA API
  virtio_pci: Use the DMA API

Michael S. Tsirkin (1):
  virtio-net: Stop doing DMA from the stack

 drivers/net/virtio_net.c           |  34 ++--
 drivers/virtio/Kconfig             |   2 +-
 drivers/virtio/virtio_mmio.c       |  67 ++-
 drivers/virtio/virtio_pci_common.h |   6 -
 drivers/virtio/virtio_pci_legacy.c |  42 ++---
 drivers/virtio/virtio_pci_modern.c |  61 ++-
 drivers/virtio/virtio_ring.c       | 348 ++---
 include/linux/virtio.h             |  23 ++-
 include/linux/virtio_ring.h        |  35
 tools/virtio/linux/dma-mapping.h   |  17 ++
 10 files changed, 426 insertions(+), 209 deletions(-)
 create mode 100644 tools/virtio/linux/dma-mapping.h

--
2.4.3
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 2/6] virtio_ring: Support DMA APIs
virtio_ring currently sends the device (usually a hypervisor) physical addresses of its I/O buffers. This is okay when DMA addresses and physical addresses are the same thing, but this isn't always the case. For example, this never works on Xen guests, and it is likely to fail if a physical "virtio" device ever ends up behind an IOMMU or swiotlb. The immediate use case for me is to enable virtio on Xen guests. For that to work, we need vring to support DMA address translation as well as a corresponding change to virtio_pci or to another driver. With this patch, if enabled, virtfs survives kmemleak and CONFIG_DMA_API_DEBUG. Signed-off-by: Andy Lutomirski--- drivers/virtio/Kconfig | 2 +- drivers/virtio/virtio_ring.c | 190 +++ tools/virtio/linux/dma-mapping.h | 17 3 files changed, 172 insertions(+), 37 deletions(-) create mode 100644 tools/virtio/linux/dma-mapping.h diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index cab9f3f63a38..77590320d44c 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -60,7 +60,7 @@ config VIRTIO_INPUT config VIRTIO_MMIO tristate "Platform bus driver for memory mapped virtio devices" - depends on HAS_IOMEM + depends on HAS_IOMEM && HAS_DMA select VIRTIO ---help--- This drivers provides support for memory mapped virtio diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 096b857e7b75..a872eb89587f 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ @@ -54,7 +55,14 @@ #define END_USE(vq) #endif -struct vring_virtqueue { +struct vring_desc_state +{ + void *data; /* Data for callback. */ + struct vring_desc *indir_desc; /* Indirect descriptor, if any. 
*/ +}; + +struct vring_virtqueue +{ struct virtqueue vq; /* Actual memory layout for this queue */ @@ -92,12 +100,71 @@ struct vring_virtqueue { ktime_t last_add_time; #endif - /* Tokens for callbacks. */ - void *data[]; + /* Per-descriptor state. */ + struct vring_desc_state desc_state[]; }; #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +/* + * The DMA ops on various arches are rather gnarly right now, and + * making all of the arch DMA ops work on the vring device itself + * is a mess. For now, we use the parent device for DMA ops. + */ +struct device *vring_dma_dev(const struct vring_virtqueue *vq) +{ + return vq->vq.vdev->dev.parent; +} + +/* Map one sg entry. */ +static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, + struct scatterlist *sg, + enum dma_data_direction direction) +{ + /* +* We can't use dma_map_sg, because we don't use scatterlists in +* the way it expects (we don't guarantee that the scatterlist +* will exist for the lifetime of the mapping). +*/ + return dma_map_page(vring_dma_dev(vq), + sg_page(sg), sg->offset, sg->length, + direction); +} + +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + return dma_map_single(vring_dma_dev(vq), + cpu_addr, size, direction); +} + +static void vring_unmap_one(const struct vring_virtqueue *vq, + struct vring_desc *desc) +{ + u16 flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), +virtio64_to_cpu(vq->vq.vdev, desc->addr), +virtio32_to_cpu(vq->vq.vdev, desc->len), +(flags & VRING_DESC_F_WRITE) ? +DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? 
+ DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + return dma_mapping_error(vring_dma_dev(vq), addr); +} + static struct vring_desc *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) { @@ -131,7 +198,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, struct
Re: [PATCH v2 1/3] virtio_net: Stop doing DMA from the stack
On Wed, Oct 28, 2015 at 12:07 AM, Michael S. Tsirkin wrote:
> How about this instead? Less code, more robust.
>
> Warning: untested. If you do like this approach, Tested-by would be
> appreciated.

I like it.

Tested-by: Andy Lutomirski

--Andy
[PATCH 5/6] KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU
This patch adds support to real-mode KVM to search for a core running in the host partition and send it an IPI message with the VCPU to be woken. This avoids having to switch to the host partition to complete an H_IPI hypercall when the VCPU which is the target of the H_IPI is not loaded (is not running in the guest).

The patch also includes the support in the IPI handler running in the host to do the wakeup by calling kvmppc_xics_ipi_action for the PPC_MSG_RM_HOST_ACTION message.

When a guest is being destroyed, we need to ensure that there are no pending IPIs waiting to wake up a VCPU before we free the VCPUs of the guest. This is accomplished by:
- Forcing a PPC_MSG_CALL_FUNCTION IPI to be completed by all CPUs before
  freeing any VCPUs in kvm_arch_destroy_vm()
- Ensuring that any PPC_MSG_RM_HOST_ACTION messages are executed before
  any other PPC_MSG_CALL_FUNCTION messages

Signed-off-by: Suresh Warrier
---
 arch/powerpc/kernel/smp.c            | 11 +
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 81 ++--
 arch/powerpc/kvm/powerpc.c           | 10 +
 3 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c07bfad..8958c2a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -280,6 +280,17 @@ irqreturn_t smp_ipi_demux(void)
 
 	do {
 		all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+		/*
+		 * Must check for PPC_MSG_RM_HOST_ACTION messages
+		 * before PPC_MSG_CALL_FUNCTION messages because when
+		 * a VM is destroyed, we call kick_all_cpus_sync()
+		 * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+		 * messages have completed before we free any VCPUs.
+		 */
+		if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+			kvmppc_xics_ipi_action();
+#endif
 		if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
 			generic_smp_call_function_interrupt();
 		if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 43ffbfe..b8d6403 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -51,11 +51,70 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
 
 /* -- ICP routines -- */
 
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go in a circle until we get back to our ID looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops.
+ *
+ * In the future, could consider using a fairer algorithm (one
+ * that distributes the IPIs better)
+ *
+ * Returns -1, if no CPU could be found in the host
+ * Else, returns a CPU Id which has been reserved for use
+ */
+static inline int grab_next_hostcore(int start,
+		struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+	bool success;
+	int core;
+	union kvmppc_rm_state old, new;
+
+	for (core = start + 1; core < max; core++) {
+		old = new = READ_ONCE(rm_core[core].rm_state);
+
+		if (!old.in_host || old.rm_action)
+			continue;
+
+		/* Try to grab this host core if not taken already. */
+		new.rm_action = action;
+
+		success = cmpxchg64(&rm_core[core].rm_state.raw,
+				    old.raw, new.raw) == old.raw;
+		if (success) {
+			/*
+			 * Make sure that the store to the rm_action is made
+			 * visible before we return to caller (and the
+			 * subsequent store to rm_data) to synchronize with
+			 * the IPI handler.
+			 */
+			smp_wmb();
+			return core;
+		}
+	}
+
+	return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+	int core;
+	int my_core = smp_processor_id() >> threads_shift;
+	struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+	core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+	if (core == -1)
+		core = grab_next_hostcore(core, rm_core, my_core, action);
+
+	return core;
+}
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *this_vcpu)
 {
 	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
 	int cpu;
+	int hcore, hcpu;
 
 	/* Mark the target VCPU as having an interrupt pending */
 	vcpu->stat.queue_intr++;
@@ -67,11 +126,25 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 		return;
 	}
-
[PATCH 1/6] KVM: PPC: Book3S HV: Host-side RM data structures
This patch defines the data structures to support the setting up of host side operations while running in real mode in the guest, and also the functions to allocate and free it. The operations are for now limited to virtual XICS operations. Currently, we have only defined one operation in the data structure: - Wake up a VCPU sleeping in the host when it receives a virtual interrupt The operations are assigned at the core level because PowerKVM requires that the host run in SMT off mode. For each core, we will need to manage its state atomically - where the state is defined by: 1. Is the core running in the host? 2. Is there a Real Mode (RM) operation pending on the host? Currently, core state is only managed at the whole-core level even when the system is in split-core mode. This just limits the number of free or "available" cores in the host to perform any host-side operations. The kvmppc_host_rm_core.rm_data allows any data to be passed by KVM in real mode to the host core along with the operation to be performed. The kvmppc_host_rm_ops structure is allocated the very first time a guest VM is started. Initial core state is also set - all online cores are in the host. This structure is never deleted, not even when there are no active guests. However, it needs to be freed when the module is unloaded because the kvmppc_host_rm_ops_hv can contain function pointers to kvm-hv.ko functions for the different supported host operations. 
Signed-off-by: Suresh Warrier--- arch/powerpc/include/asm/kvm_ppc.h | 31 arch/powerpc/kvm/book3s_hv.c | 70 arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++ 3 files changed, 104 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c6ef05b..47cd441 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -437,6 +437,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } +extern void kvmppc_alloc_host_rm_ops(void); +extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -446,6 +448,8 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); #else +static inline void kvmppc_alloc_host_rm_ops(void) {}; +static inline void kvmppc_free_host_rm_ops(void) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } @@ -459,6 +463,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +/* + * Host-side operations we want to set up while running in real + * mode in the guest operating on the xics. + * Currently only VCPU wakeup is supported. 
+ */ + +union kvmppc_rm_state { + unsigned long raw; + struct { + u32 in_host; + u32 rm_action; + }; +}; + +struct kvmppc_host_rm_core { + union kvmppc_rm_state rm_state; + void *rm_data; + char pad[112]; +}; + +struct kvmppc_host_rm_ops { + struct kvmppc_host_rm_core *rm_core; + void(*vcpu_kick)(struct kvm_vcpu *vcpu); +}; + +extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; + static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_BOOKE_HV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9c26c5a..9176e56 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2967,6 +2967,73 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +#ifdef CONFIG_KVM_XICS +/* + * Allocate a per-core structure for managing state about which cores are + * running in the host versus the guest and for exchanging data between + * real mode KVM and CPU running in the host. + * This is only done for the first VM. + * The allocated structure stays even if all VMs have stopped. + * It is only freed when the kvm-hv module is unloaded. + * It's OK for this routine to fail, we just don't support host + * core operations like redirecting H_IPI wakeups. + */ +void kvmppc_alloc_host_rm_ops(void) +{ + struct kvmppc_host_rm_ops *ops; + unsigned long l_ops; + int cpu, core; + int size; + + /* Not the first time here ? */ + if (kvmppc_host_rm_ops_hv != NULL) + return; + + ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); + if (!ops) + return; + + size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); +
[PATCH 4/6] KVM: PPC: Book3S HV: Host side kick VCPU when poked by real-mode KVM
This patch adds the support for the kick VCPU operation for kvmppc_host_rm_ops. The kvmppc_xics_ipi_action() function provides the function to be invoked for a host side operation when poked by the real mode KVM. This is initiated by KVM by sending an IPI to any free host core. KVM real mode must set the rm_action to XICS_RM_KICK_VCPU and rm_data to point to the VCPU to be woken up before sending the IPI. Note that we have allocated one kvmppc_host_rm_core structure per core. The above values need to be set in the structure corresponding to the core to which the IPI will be sent. Signed-off-by: Suresh Warrier--- arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 36 3 files changed, 39 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 47cd441..1b93519 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -447,6 +447,7 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xics_ipi_action(void); #else static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 390af5b..80b1eb3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3085,6 +3085,8 @@ void kvmppc_alloc_host_rm_ops(void) ops->rm_core[core].rm_state.in_host = 1; } + ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; + /* * Make the contents of the kvmppc_host_rm_ops structure visible * to other CPUs before we assign it to the global variable. 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 24f5807..43ffbfe 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "book3s_xics.h"
@@ -623,3 +624,38 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
  bail:
 	return check_too_hard(xics, icp);
 }
+
+/* --- Non-real mode XICS-related built-in routines --- */
+
+/**
+ * Host Operations poked by RM KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+	switch (action) {
+	case XICS_RM_KICK_VCPU:
+		kvmppc_host_rm_ops_hv->vcpu_kick(data);
+		break;
+	default:
+		WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+		break;
+	}
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+	int core;
+	unsigned int cpu = smp_processor_id();
+	struct kvmppc_host_rm_core *rm_corep;
+
+	core = cpu >> threads_shift;
+	rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+	if (rm_corep->rm_data) {
+		rm_host_ipi_action(rm_corep->rm_state.rm_action,
+				   rm_corep->rm_data);
+		rm_corep->rm_data = NULL;
+		rm_corep->rm_state.rm_action = 0;
+	}
+}
--
1.8.3.4
[PATCH 2/6] KVM: PPC: Book3S HV: Manage core host state
Update the core host state in kvmppc_host_rm_ops whenever the primary thread of the core enters the guest or returns back. Signed-off-by: Suresh Warrier--- arch/powerpc/kvm/book3s_hv.c | 44 1 file changed, 44 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9176e56..7a62aaa 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2261,6 +2261,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } /* + * Clear core from the list of active host cores as we are about to + * enter the guest. Only do this if it is the primary thread of the + * core (not if a subcore) that is entering the guest. + */ +static inline void kvmppc_clear_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + /* +* Memory barrier can be omitted here as we will do a smp_wmb() +* later in kvmppc_start_thread and we need ensure that state is +* visible to other CPUs only after we enter guest. +*/ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; +} + +/* + * Advertise this core as an active host core since we exited the guest + * Only need to do this if it is the primary thread of the core that is + * exiting. + */ +static inline void kvmppc_set_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + + /* +* Memory barrier can be omitted here because we do a spin_unlock +* immediately after this which provides the memory barrier. +*/ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; +} + +/* * Run a set of guest threads on a physical core. * Called with vc->lock held. 
 */
@@ -2372,6 +2412,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		}
 	}
 
+	kvmppc_clear_host_core(pcpu);
+
 	/* Start all the threads */
 	active = 0;
 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
@@ -2468,6 +2510,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			kvmppc_ipi_thread(pcpu + i);
 	}
 
+	kvmppc_set_host_core(pcpu);
+
 	spin_unlock(&vc->lock);
 
 	/* make sure updates to secondary vcpu structs are visible now */
--
1.8.3.4
[PATCH 3/6] KVM: PPC: Book3S HV: kvmppc_host_rm_ops - handle offlining CPUs
The kvmppc_host_rm_ops structure keeps track of which cores are in the host by maintaining a bitmask of active/runnable online CPUs that have not entered the guest. This patch adds support to manage the bitmask when a CPU is offlined or onlined in the host.

Signed-off-by: Suresh Warrier
---
 arch/powerpc/kvm/book3s_hv.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7a62aaa..390af5b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3012,6 +3012,36 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 }
 
 #ifdef CONFIG_KVM_XICS
+static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+			void *hcpu)
+{
+	unsigned long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		kvmppc_set_host_core(cpu);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		kvmppc_clear_host_core(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmppc_cpu_notifier = {
+	.notifier_call = kvmppc_cpu_notify,
+};
+
 /*
  * Allocate a per-core structure for managing state about which cores are
  * running in the host versus the guest and for exchanging data between
@@ -3045,6 +3075,8 @@ void kvmppc_alloc_host_rm_ops(void)
 		return;
 	}
 
+	get_online_cpus();
+
 	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
 		if (!cpu_online(cpu))
 			continue;
@@ -3063,14 +3095,21 @@ void kvmppc_alloc_host_rm_ops(void)
 	l_ops = (unsigned long) ops;
 
 	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+		put_online_cpus();
 		kfree(ops->rm_core);
 		kfree(ops);
+		return;
 	}
+
+	register_cpu_notifier(&kvmppc_cpu_notifier);
+
+	put_online_cpus();
 }
 
 void kvmppc_free_host_rm_ops(void)
 {
 	if (kvmppc_host_rm_ops_hv) {
+		unregister_cpu_notifier(&kvmppc_cpu_notifier);
 		kfree(kvmppc_host_rm_ops_hv->rm_core);
 		kfree(kvmppc_host_rm_ops_hv);
 		kvmppc_host_rm_ops_hv = NULL;
--
1.8.3.4
[PATCH 0/6] KVM: PPC: Book3S HV: Optimize wakeup VCPU from H_IPI
When the VCPU target of an H_IPI hypercall is not running in the guest, we need to do a kick VCPU (wake the VCPU thread) to make it runnable. The real-mode version of the H_IPI hypercall cannot do this because it involves waking a sleeping thread. Thus the hcall returns H_TOO_HARD which forces a switch back to host so that the H_IPI call can be completed in virtual mode. This has been found to cause a slowdown for many workloads like YCSB MongoDB, small message networking, etc. This patch set optimizes the wakeup of the target VCPU by posting a free core already running in the host to do the wakeup, thus avoiding the switch to host and back. It requires maintaining a bitmask of all the available cores in the system to indicate if they are in the host or running in some guest. It also requires the H_IPI hypercall to search for a free host core and send it a new IPI message PPC_MSG_RM_HOST_ACTION after stashing away some parameters like the pointer to VCPU for the IPI handler. Locks are avoided by using atomic operations to save core state, to find and reserve a core in the host, etc. Note that it is possible for a guest to be destroyed and its VCPUs freed before the IPI handler gets to run. This case is handled by ensuring that any pending PPC_MSG_RM_HOST_ACTION IPIs are completed before proceeding with freeing the VCPUs. A tunable h_ipi_redirect is also included in the patch set to disable the feature. This patch set depends upon patches to powerpc to increase the number of supported IPI messages to 8 and which also defines the PPC_MSG_RM_HOST_ACTION message. 
Suresh Warrier (6):
  KVM: PPC: Book3S HV: Host-side RM data structures
  KVM: PPC: Book3S HV: Manage core host state
  KVM: PPC: Book3S HV: kvmppc_host_rm_ops - handle offlining CPUs
  KVM: PPC: Book3S HV: Host side kick VCPU when poked by real-mode KVM
  KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU
  KVM: PPC: Book3S HV: Add tunable to control H_IPI redirection

 arch/powerpc/include/asm/kvm_ppc.h   |  33 +++
 arch/powerpc/kernel/smp.c            |  11 +++
 arch/powerpc/kvm/book3s_hv.c         | 166 +++
 arch/powerpc/kvm/book3s_hv_builtin.c |   3 +
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 120 -
 arch/powerpc/kvm/powerpc.c           |  10 +++
 6 files changed, 340 insertions(+), 3 deletions(-)

--
1.8.3.4
[PATCH 6/6] KVM: PPC: Book3S HV: Add tunable to control H_IPI redirection
Redirecting the wakeup of a VCPU from the H_IPI hypercall to a core running in the host is usually a good idea; most workloads seem to benefit. However, in one heavily interrupt-driven SMT1 workload, some regression was observed. This patch adds a kvm_hv module parameter called h_ipi_redirect to control this feature. The default value for this tunable is 1, i.e. the feature is enabled.

Signed-off-by: Suresh Warrier
---
 arch/powerpc/include/asm/kvm_ppc.h   |  1 +
 arch/powerpc/kvm/book3s_hv.c         | 11 +++
 arch/powerpc/kvm/book3s_hv_rm_xics.c |  5 -
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 1b93519..29d1442 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -448,6 +448,7 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 			struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern int h_ipi_redirect;
 #else
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 80b1eb3..20b2598 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -81,6 +81,17 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
+#ifdef CONFIG_KVM_XICS
+static struct kernel_param_ops module_param_ops = {
+	.set = param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
+#endif
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index b8d6403..b162f41 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -24,6 +24,9 @@
 
 #define DEBUG_PASSUP
 
+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			       u32 new_irq);
 
@@ -134,7 +137,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 		cpu = vcpu->arch.thread_cpu;
 		if (cpu < 0 || cpu >= nr_cpu_ids) {
 			hcore = -1;
-			if (kvmppc_host_rm_ops_hv)
+			if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
 				hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
 			if (hcore != -1) {
 				hcpu = hcore << threads_shift;
--
1.8.3.4
Re: [RFC Patch 00/12] IXGBE: Add live migration support for SRIOV NIC
On 2015-10-26 23:03, Alexander Duyck wrote:
> No. I think you are missing the fact that there are 256 descriptors per
> page. As such if you dirty just 1 you will be pulling in 255 more, of
> which you may or may not have pulled in the receive buffer for.
>
> So for example if you have the descriptor ring size set to 256 then that
> means you are going to get whatever the descriptor ring has since you
> will be marking the entire ring dirty with every packet processed,
> however you cannot guarantee that you are going to get all of the
> receive buffers unless you go through and flush the entire ring prior to
> migrating.

Yes, that will be a problem. How about adding a tag to each Rx buffer and checking the tag when delivering the Rx buffer to the stack? If the tag has been overwritten, this means the packet data has been migrated.

> This is why I have said you will need to do something to force the rings
> to be flushed such as initiating a PM suspend prior to migrating. You
> need to do something to stop the DMA and flush the remaining Rx buffers
> if you want to have any hope of being able to migrate the Rx in a
> consistent state. Beyond that the only other thing you have to worry
> about are the Rx buffers that have already been handed off to the
> stack. However those should be handled if you do a suspend and somehow
> flag pages as dirty when they are unmapped from the DMA.
>
> - Alex

This will be simple and maybe our first version to enable migration. But we still hope to find a way not to disable DMA before stopping the VCPU, to decrease service downtime.

--
Best regards
Tianyu Lan
[PATCH] target-i386: enable clflushopt/clwb/pcommit instructions
These instructions are used by NVDIMM drivers; the specification is located at:
https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

These instructions are available on Skylake Server.

Signed-off-by: Xiao Guangrong
---
 target-i386/cpu.c | 8 +---
 target-i386/cpu.h | 3 +++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 05d7f26..ebecdb4 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -259,8 +259,8 @@ static const char *svm_feature_name[] = {
 static const char *cpuid_7_0_ebx_feature_name[] = {
     "fsgsbase", "tsc_adjust", NULL, "bmi1", "hle", "avx2", NULL, "smep",
     "bmi2", "erms", "invpcid", "rtm", NULL, NULL, "mpx", NULL,
-    "avx512f", NULL, "rdseed", "adx", "smap", NULL, NULL, NULL,
-    NULL, NULL, "avx512pf", "avx512er", "avx512cd", NULL, NULL, NULL,
+    "avx512f", NULL, "rdseed", "adx", "smap", NULL, "pcommit", "clflushopt",
+    "clwb", NULL, "avx512pf", "avx512er", "avx512cd", NULL, NULL, NULL,
 };
 
 static const char *cpuid_apm_edx_feature_name[] = {
@@ -345,7 +345,9 @@ static const char *cpuid_6_feature_name[] = {
 #define TCG_SVM_FEATURES 0
 #define TCG_KVM_FEATURES 0
 #define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \
-          CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX)
+          CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
+          CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT | \
+          CPUID_7_0_EBX_CLWB)
           /* missing: CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
              CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 54d9d50..71ecb5c 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -573,6 +573,9 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EBX_RDSEED   (1U << 18)
 #define CPUID_7_0_EBX_ADX      (1U << 19)
 #define CPUID_7_0_EBX_SMAP     (1U << 20)
+#define CPUID_7_0_EBX_PCOMMIT    (1U << 22) /* Persistent Commit */
+#define CPUID_7_0_EBX_CLFLUSHOPT (1U << 23) /* Flush a Cache Line Optimized */
+#define CPUID_7_0_EBX_CLWB       (1U << 24) /* Cache Line Write Back */
 #define CPUID_7_0_EBX_AVX512PF (1U << 26) /* AVX-512 Prefetch */
 #define CPUID_7_0_EBX_AVX512ER (1U << 27) /* AVX-512 Exponential and Reciprocal */
 #define CPUID_7_0_EBX_AVX512CD (1U << 28) /* AVX-512 Conflict Detection */
--
2.4.3
Re: [RFC Patch 00/12] IXGBE: Add live migration support for SRIOV NIC
On 10/28/2015 11:12 PM, Lan Tianyu wrote: On October 26, 2015 23:03, Alexander Duyck wrote: No. I think you are missing the fact that there are 256 descriptors per page. As such if you dirty just 1 you will be pulling in 255 more, of which you may or may not have pulled in the receive buffer for. So for example if you have the descriptor ring size set to 256 then that means you are going to get whatever the descriptor ring has since you will be marking the entire ring dirty with every packet processed, however you cannot guarantee that you are going to get all of the receive buffers unless you go through and flush the entire ring prior to migrating. Yes, that will be a problem. How about adding a tag for each Rx buffer and checking the tag when delivering the Rx buffer to the stack? If the tag has been overwritten, this means the packet data has been migrated. Then you have to come up with a pattern that you can guarantee is the tag and not part of the packet data. That isn't going to be something that is easy to do. It would also have a serious performance impact on the VF. This is why I have said you will need to do something to force the rings to be flushed such as initiating a PM suspend prior to migrating. You need to do something to stop the DMA and flush the remaining Rx buffers if you want to have any hope of being able to migrate the Rx in a consistent state. Beyond that the only other thing you have to worry about are the Rx buffers that have already been handed off to the stack. However those should be handled if you do a suspend and somehow flag pages as dirty when they are unmapped from the DMA. - Alex This will be simple and could be our first version to enable migration. But we still hope to find a way not to disable DMA before stopping the VCPU, to decrease service downtime. You have to stop the Rx DMA at some point anyway. It is the only means to guarantee that the device stops updating buffers and descriptors so that you will have a consistent state.
Your code was having to do a bunch of shuffling in order to get things set up so that you could bring the interface back up. I would argue that it may actually be faster at least on the bring-up to just drop the old rings and start over since it greatly reduced the complexity and the amount of device related data that has to be moved.
Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS
On Thu, Oct 29, 2015 at 09:42:12AM +0900, David Woodhouse wrote: > Aside from the lack of security, the other disadvantage of that is that > you have to pin *all* pages of a guest in case DMA happens; you don't > get to pin *only* those pages which are referenced by that guest's > IOMMU page tables... We do bypass the IOMMU but not the DMA API, and given that before we call the DMA API we pin the page, we do not need to pin all the pages. Just the ones we use for the DMA. For me this flag looks orthogonal to the page pinning issue you brought up. It is just a hint to the DMA API that we want to use simple & fast mapping while knowing we lose IOMMU protection for this memory. For the IB case, setting up and tearing down DMA mappings for the driver's data buffers is expensive. But we could for example consider mapping all the HW control objects that validate the HW access to the driver's data buffers as IOMMU-protected, and so have IOMMU protection on those critical objects while having fast set-up/tear-down of driver data buffers. The HW control objects have stable mapping for the lifetime of the system. So the case of using both types of DMA mappings is still valid. > > -- > dwmw2 > >
Re: [RFC Patch 00/12] IXGBE: Add live migration support for SRIOV NIC
On October 29, 2015 14:58, Alexander Duyck wrote: > > Your code was having to do a bunch of shuffling in order to get things > set up so that you could bring the interface back up. I would argue > that it may actually be faster at least on the bring-up to just drop the > old rings and start over since it greatly reduced the complexity and the > amount of device related data that has to be moved. If we give up the old ring after migration and keep DMA running before stopping the VCPU, it seems we don't need to track the Tx/Rx descriptor rings and just need to make sure that all Rx buffers delivered to the stack have been migrated. 1) Dummy-write the Rx buffer before checking the Rx descriptor to ensure the packet is migrated first. 2) Make a copy of the Rx descriptor and then use the copied data to check the buffer status. Don't use the original descriptor because it won't be migrated and migration may happen between two accesses of the Rx descriptor. -- Best regards Tianyu Lan
[PATCH net-next rfc V2 2/2] vhost_net: basic polling support
This patch tries to poll for newly added tx buffers for a while at the end of tx processing. The maximum time spent on polling is limited through a module parameter. To avoid blocking rx, the loop will end if there are other works queued on vhost, so in fact the socket receive queue is also polled. Signed-off-by: Jason Wang--- drivers/vhost/net.c | 54 + 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9eda69e..30e6d3d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -31,9 +31,13 @@ #include "vhost.h" static int experimental_zcopytx = 1; +static int busyloop_timeout = 50; module_param(experimental_zcopytx, int, 0444); MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" " 1 -Enable; 0 - Disable"); +module_param(busyloop_timeout, int, 0444); +MODULE_PARM_DESC(busyloop_timeout, "Maximum number of time (in us) " + "could be spend on busy polling"); /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others.
*/ @@ -287,6 +291,49 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) rcu_read_unlock_bh(); } +static inline unsigned long busy_clock(void) +{ + return local_clock() >> 10; +} + +static bool tx_can_busy_poll(struct vhost_dev *dev, +unsigned long endtime) +{ + return likely(!need_resched()) && + likely(!time_after(busy_clock(), endtime)) && + likely(!signal_pending(current)) && + !vhost_has_work(dev) && + single_task_running(); +} + +static int vhost_net_tx_get_vq_desc(struct vhost_virtqueue *vq, + struct iovec iov[], unsigned int iov_size, + unsigned int *out_num, unsigned int *in_num) +{ + unsigned long uninitialized_var(endtime); + int head; + + if (busyloop_timeout) { + preempt_disable(); + endtime = busy_clock() + busyloop_timeout; + } + +again: + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), +out_num, in_num, NULL, NULL); + + if (head == vq->num && busyloop_timeout && + tx_can_busy_poll(vq->dev, endtime)) { + cpu_relax(); + goto again; + } + + if (busyloop_timeout) + preempt_enable(); + + return head; +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) @@ -331,10 +378,9 @@ static void handle_tx(struct vhost_net *net) % UIO_MAXIOV == nvq->done_idx)) break; - head = vhost_get_vq_desc(vq, vq->iov, -ARRAY_SIZE(vq->iov), -, , -NULL, NULL); + head = vhost_net_tx_get_vq_desc(vq, vq->iov, + ARRAY_SIZE(vq->iov), + , ); /* On error, stop handling until the next kick. */ if (unlikely(head < 0)) break; -- 1.8.3.1
[PATCH net-next rfc V2 0/2] basic busy polling support for vhost_net
Hi all: This series tries to add basic busy polling for vhost net. The idea is simple: at the end of tx processing, busy poll for newly added tx descriptors and the rx receive socket for a while. The maximum amount of time (in us) that could be spent on busy polling is specified through a module parameter. Tests were done with:
- 50 us as busy loop timeout
- Netperf 2.6
- Two machines with back-to-back connected mlx4
- Guest with 8 vcpus and 1 queue

Results show a very large improvement on both tx (up to 158%) and rr (up to 53%) while rx stays about the same as before. In most cases the cpu utilization is also improved:

Guest TX: size/session/+thu%/+normalize%
64/ 1/ +17%/ +6%
64/ 4/ +9%/ +17%
64/ 8/ +34%/ +21%
512/ 1/ +48%/ +40%
512/ 4/ +31%/ +20%
512/ 8/ +39%/ +22%
1024/ 1/ +158%/ +99%
1024/ 4/ +20%/ +11%
1024/ 8/ +40%/ +18%
2048/ 1/ +108%/ +74%
2048/ 4/ +21%/ +7%
2048/ 8/ +32%/ +14%
4096/ 1/ +94%/ +77%
4096/ 4/ +7%/ -6%
4096/ 8/ +9%/ -4%
16384/ 1/ +33%/ +9%
16384/ 4/ +10%/ -6%
16384/ 8/ +19%/ +2%
65535/ 1/ +15%/ -6%
65535/ 4/ +8%/ -9%
65535/ 8/ +14%/ 0%

Guest RX: size/session/+thu%/+normalize%
64/ 1/ -3%/ -3%
64/ 4/ +4%/ +20%
64/ 8/ -1%/ -1%
512/ 1/ +20%/ +12%
512/ 4/ +1%/ +3%
512/ 8/ 0%/ -5%
1024/ 1/ +9%/ -2%
1024/ 4/ 0%/ +5%
1024/ 8/ +1%/ 0%
2048/ 1/ 0%/ +3%
2048/ 4/ -2%/ +3%
2048/ 8/ -1%/ -3%
4096/ 1/ -8%/ +3%
4096/ 4/ 0%/ +2%
4096/ 8/ 0%/ +5%
16384/ 1/ +3%/ 0%
16384/ 4/ +2%/ +2%
16384/ 8/ 0%/ +13%
65535/ 1/ 0%/ +3%
65535/ 4/ +2%/ -1%
65535/ 8/ +1%/ +14%

TCP_RR: size/session/+thu%/+normalize%
1/ 1/ +8%/ -6%
1/ 50/ +18%/ +15%
1/ 100/ +22%/ +19%
1/ 200/ +25%/ +23%
64/ 1/ +2%/ -19%
64/ 50/ +46%/ +39%
64/ 100/ +47%/ +39%
64/ 200/ +50%/ +44%
512/ 1/ 0%/ -28%
512/ 50/ +50%/ +44%
512/ 100/ +53%/ +47%
512/ 200/ +51%/ +58%
1024/ 1/ +3%/ -14%
1024/ 50/ +45%/ +37%
1024/ 100/ +53%/ +49%
1024/ 200/ +48%/ +55%

Changes from V1:
- Add a comment for vhost_has_work() to explain why it could be lockless
- Add param description for busyloop_timeout
- Split out the busy polling logic into a new helper
- Check and exit the loop when there's a pending signal
- Disable preemption during busy looping to make sure local_clock() was correctly used.

Todo:
- Make the busyloop timeout configurable per VM through an ioctl.

Please review. Thanks Jason Wang (2): vhost: introduce vhost_has_work() vhost_net: basic polling support drivers/vhost/net.c | 54 +++ drivers/vhost/vhost.c | 7 +++ drivers/vhost/vhost.h | 1 + 3 files changed, 58 insertions(+), 4 deletions(-) -- 1.8.3.1
[PATCH net-next rfc V2 1/2] vhost: introduce vhost_has_work()
This patch introduces a helper which can give a hint about whether or not there is work queued in the work list. Signed-off-by: Jason Wang--- drivers/vhost/vhost.c | 7 +++ drivers/vhost/vhost.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index eec2f11..163b365 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -245,6 +245,13 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work) } EXPORT_SYMBOL_GPL(vhost_work_queue); +/* A lockless hint for busy polling code to exit the loop */ +bool vhost_has_work(struct vhost_dev *dev) +{ + return !list_empty(>work_list); +} +EXPORT_SYMBOL_GPL(vhost_has_work); + void vhost_poll_queue(struct vhost_poll *poll) { vhost_work_queue(poll->dev, >work); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 4772862..ea0327d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -37,6 +37,7 @@ struct vhost_poll { void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn); void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work); +bool vhost_has_work(struct vhost_dev *dev); void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, unsigned long mask, struct vhost_dev *dev); -- 1.8.3.1
Re: [PATCH v3 0/3] virtio DMA API core stuff
On Wed, Oct 28, 2015 at 03:51:58PM -0700, Andy Lutomirski wrote: > On Wed, Oct 28, 2015 at 9:12 AM, Michael S. Tsirkin wrote: > > On Wed, Oct 28, 2015 at 11:32:34PM +0900, David Woodhouse wrote: > >> > I don't have a problem with extending DMA API to address > >> > more usecases. > >> > >> No, this isn't an extension. This is fixing a bug, on certain platforms > >> where the DMA API has currently done the wrong thing. > >> > >> We have historically worked around that bug by introducing *another* > >> bug, which is not to *use* the DMA API in the virtio driver. > >> > >> Sure, we can co-ordinate those two bug-fixes. But let's not talk about > >> them as anything other than bug-fixes. > > > > It was pretty practical not to use it. All virtio devices at the time > > without exception bypassed the IOMMU, so it was a question of omitting a > > couple of function calls in virtio versus hacking on DMA implementation > > on multiple platforms. We have more policy options now, so I agree it's > > time to revisit this. > > > > But for me, the most important thing is that we do coordinate. > > > >> > > Drivers use DMA API. No more talky. > >> > > >> > Well for virtio they don't ATM. And 1:1 mapping makes perfect sense > >> > for the vast majority of users, so I can't switch them over > >> > until the DMA API actually addresses all existing usecases. > >> > >> That's still not your business; it's the platform's. And there are > >> hardware implementations of the virtio protocols on real PCI cards. And > >> we have the option of doing IOMMU translation for the virtio devices > >> even in a virtual machine. Just don't get involved. > >> > >> -- > >> dwmw2 > >> > >> > > > > I'm involved anyway, it's possible not to put all the code in the virtio > > subsystem in guest though. But I suspect we'll need to find a way for > > non-linux drivers within guest to work correctly too, and they might > > have trouble poking at things at the system level.
So possibly virtio > > subsystem will have to tell platform "this device wants to bypass IOMMU" > > and then DMA API does the right thing. > > > > After some discussion at KS, no one came up with an example where it's > necessary, and the patches to convert virtqueue to use the DMA API are > much nicer when they convert it unconditionally. It's very surprising that no one could. I did above; I'll try again below. Note: below discusses configuration *within guest*. Example: you have a mix of assigned devices and virtio devices. You don't trust your assigned device vendor not to corrupt your memory so you want to limit the damage your assigned device can do to your guest, so you use an IOMMU for that. Thus existing iommu=pt within guest is out. But you trust your hypervisor (you have no choice anyway), and you don't want the overhead of tweaking IOMMU on data path for virtio. Thus iommu=on is out too. > The two interesting cases we thought of were PPC and x86's emulated > Q35 IOMMU. PPC will look into architecting a devicetree-based way to > indicate passthrough status and will add quirks for the existing > virtio devices. Isn't this specified by the hypervisor? I don't think this is a good way to do this: guest security should be up to the guest. > Everyone seems to agree that x86's emulated Q35 thing > is just buggy right now and should be taught to use the existing ACPI > mechanism for enumerating passthrough devices. I'm not sure what ACPI has to do with it. It's about a way for guest users to specify whether they want to bypass an IOMMU for a given device. > I'll send a new version of the series soon. > > --Andy By the way, a bunch of code is missing on the QEMU side to make this useful: 1. virtio ignores the iommu 2. vhost user ignores the iommu 3. dataplane ignores the iommu 4. vhost-net ignores the iommu 5. VFIO ignores the iommu I think so far I only saw patches for 1 above.
-- MST
Re: [RFC PATCH] VFIO: Add a parameter to force nonthread IRQ
On 29/10/2015 04:11, Alex Williamson wrote: > > The irqfd is already able to schedule a work item, because it runs with > > interrupts disabled, so I think we can always return IRQ_HANDLED. > > I'm confused by this. The problem with adding IRQF_NO_THREAD to our > current handler is that it hits the spinlock that can sleep in > eventfd_signal() and the waitqueue further down the stack before we get > to the irqfd. So if we split to a non-threaded handler vs a threaded > handler, where the non-threaded handler either returns IRQ_HANDLED or > IRQ_WAKE_THREAD to queue the threaded handler, there's only so much that > the non-threaded handler can do before we start running into the same > problem. You're right. I thought schedule_work used raw spinlocks (and then everything would be done in the inject callback), but I was wrong. Basically where irqfd_wakeup now does schedule_work, it would need to return IRQ_WAKE_THREAD. The threaded handler then can just do the eventfd_signal. Paolo > I think that means that the non-threaded handler needs to > return IRQ_WAKE_THREAD if we need to use the current eventfd_signal() > path, such as if the bypass path is not available. If we can get > through the bypass path and the KVM irqfd side is safe for the > non-threaded handler, inject succeeds and we return IRQ_HANDLED, right? > Thanks,
Re: [PATCH v3 8/9] kvm/x86: Hyper-V synthetic interrupt controller
On Wed, Oct 28, 2015 at 06:41:45PM +0100, Paolo Bonzini wrote: > Hi Andrey, > > just one question. Is kvm_arch_set_irq actually needed? I think > everything should work fine without it. Can you check? If so, I can > remove it myself and revert the patch that introduced the hook. While Andrey is testing it, I'd like to ask a similar question re. MSI: why is there a "shortcut" for the KVM_IRQ_ROUTING_MSI case (which we basically modelled after) when it would probably get handled through the ->set handler in irqfd_inject() too? Roman.
Re: [PATCH v3 8/9] kvm/x86: Hyper-V synthetic interrupt controller
On 10/28/2015 08:41 PM, Paolo Bonzini wrote: Hi Andrey, just one question. Is kvm_arch_set_irq actually needed? I think everything should work fine without it. Can you check? If so, I can remove it myself and revert the patch that introduced the hook. Hi Paolo, I have checked that the Hyper-V SynIC unit test and some hand-made tests with a Windows guest (with SynIC enabled) work fine without kvm_arch_set_irq. It will be nice to remove this function. Thanks Paolo On 22/10/2015 18:09, Andrey Smetanin wrote: SynIC (synthetic interrupt controller) is a lapic extension, which is controlled via MSRs and maintains for each vCPU - 16 synthetic interrupt "lines" (SINT's); each can be configured to trigger a specific interrupt vector optionally with auto-EOI semantics - a message page in the guest memory with 16 256-byte per-SINT message slots - an event flag page in the guest memory with 16 2048-bit per-SINT event flag areas The host triggers a SINT whenever it delivers a new message to the corresponding slot or flips an event flag bit in the corresponding area. The guest informs the host that it can try delivering a message by explicitly asserting EOI in lapic or writing to End-Of-Message (EOM) MSR. The userspace (qemu) triggers interrupts and receives EOM notifications via irqfd with resampler; for that, a GSI is allocated for each configured SINT, and irq_routing api is extended to support GSI-SINT mapping. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. 
Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini CC: Roman Kagan Changes v3: * added KVM_CAP_HYPERV_SYNIC and KVM_IRQ_ROUTING_HV_SINT notes into docs Changes v2: * do not use posted interrupts for Hyper-V SynIC AutoEOI vectors * add Hyper-V SynIC vectors into EOI exit bitmap * Hyper-V SyniIC SINT msr write logic simplified --- Documentation/virtual/kvm/api.txt | 14 ++ arch/x86/include/asm/kvm_host.h | 14 ++ arch/x86/kvm/hyperv.c | 297 ++ arch/x86/kvm/hyperv.h | 21 +++ arch/x86/kvm/irq_comm.c | 34 + arch/x86/kvm/lapic.c | 18 ++- arch/x86/kvm/lapic.h | 5 + arch/x86/kvm/x86.c| 12 +- include/linux/kvm_host.h | 6 + include/uapi/linux/kvm.h | 8 + 10 files changed, 421 insertions(+), 8 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 092ee9f..8710418 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1451,6 +1451,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_irqchip irqchip; struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; __u32 pad[8]; } u; }; @@ -1459,6 +1460,7 @@ struct kvm_irq_routing_entry { #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 +#define KVM_IRQ_ROUTING_HV_SINT 4 No flags are specified so far, the corresponding field must be set to zero. @@ -1482,6 +1484,10 @@ struct kvm_irq_routing_s390_adapter { __u32 adapter_id; }; +struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; +}; 4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) @@ -3685,3 +3691,11 @@ available, means that that the kernel has an implementation of the H_RANDOM hypercall backed by a hardware random-number generator. If present, the kernel H_RANDOM handler can be enabled for guest use with the KVM_CAP_PPC_ENABLE_HCALL capability. 
+ +8.2 KVM_CAP_HYPERV_SYNIC + +Architectures: x86 +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller(SynIC). SynIC is used to +support Windows Hyper-V based guest paravirt drivers(VMBus). diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3c6327d..8434f88 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -374,10 +375,23 @@ struct kvm_mtrr { struct list_head head; }; +/* Hyper-V synthetic interrupt controller (SynIC)*/ +struct kvm_vcpu_hv_synic { + u64 version; + u64 control; + u64 msg_page; + u64 evt_page; + atomic64_t sint[HV_SYNIC_SINT_COUNT]; + atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; + DECLARE_BITMAP(auto_eoi_bitmap, 256); + DECLARE_BITMAP(vec_bitmap, 256); +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv {
Re: [PATCH v3 8/9] kvm/x86: Hyper-V synthetic interrupt controller
On 29/10/2015 09:45, Roman Kagan wrote: > While Andrey is testing it, I'd like to ask similar question re. MSI: > why is there a "shortcut" for KVM_IRQ_ROUTING_MSI case (which we > basically modelled after) when it would probably get handled through > ->set handler in irqfd_inject() too? Because it's a bit faster that way. :) By avoiding the schedule_work, you can improve latency by a few microseconds. It's nice to have it for the VFIO case especially, where everything you do takes you further from hardware performance. However, that shortcut is badly implemented because it lets you do a walk over all CPUs while interrupts are disabled. It should be modified to use kvm_set_msi_inatomic instead of kvm_set_msi (more precisely, I would like to remove kvm_set_msi and keep kvm_arch_irq_update; then kvm_arch_irq_update will call kvm_set_msi_inatomic). I'll post a patch next Monday. You can then benchmark the addition of synthetic interrupts to the atomic-context fast path, and see if it makes a speed difference. Paolo
Re: Steal time accounting in KVM. Benchmark.
Hi Paolo, On 10/24/15 10:03 AM, Alexey Makhalov wrote: What I figured out. It happens at the intersection of 3 features: *irq time accounting *stolen time accounting *linux guest with tickless idle only (not fully tickless) Looks like a timer interrupt storm is happening during this benchmark (with 2:1 cpu overcommit). irq time accounting gets crazy. Even 'top' shows weird statistics: 50% hi, 50% st, ~0% user, spinning processes use ~0% cpu - that is not correct. Is this the desired behavior, or does something need to be improved? Regards, Wanpeng Li
Re: [RFC Patch 00/12] IXGBE: Add live migration support for SRIOV NIC
On October 30, 2015 00:17, Alexander Duyck wrote: > On 10/29/2015 01:33 AM, Lan Tianyu wrote: >> On October 29, 2015 14:58, Alexander Duyck wrote: >>> Your code was having to do a bunch of shuffling in order to get things >>> set up so that you could bring the interface back up. I would argue >>> that it may actually be faster at least on the bring-up to just drop the >>> old rings and start over since it greatly reduced the complexity and the >>> amount of device related data that has to be moved. >> If give up the old ring after migration and keep DMA running before >> stopping VCPU, it seems we don't need to track Tx/Rx descriptor ring and >> just make sure that all Rx buffers delivered to stack has been migrated. >> >> 1) Dummy write Rx buffer before checking Rx descriptor to ensure packet >> migrated first. > > Don't dummy write the Rx descriptor. You should only really need to > dummy write the Rx buffer and you would do so after checking the > descriptor, not before. Otherwise you risk corrupting the Rx buffer > because it is possible for you to read the Rx buffer, DMA occurs, and > then you write back the Rx buffer and now you have corrupted the memory. > >> 2) Make a copy of Rx descriptor and then use the copied data to check >> buffer status. Not use the original descriptor because it won't be >> migrated and migration may happen between two access of the Rx >> descriptor. > > Do not just blindly copy the Rx descriptor ring. That is a recipe for > disaster. The problem is DMA has to happen in a very specific order for > things to function correctly. The Rx buffer has to be written and then > the Rx descriptor. The problem is you will end up getting a read-ahead > on the Rx descriptor ring regardless of which order you dirty things in. Sorry, I didn't say it clearly. I meant to copy one Rx descriptor when receiving the rx irq and handling the Rx ring.
The current code in ixgbevf_clean_rx_irq() checks the status of the Rx descriptor to see whether its Rx buffer has been populated with data, and then reads the packet length from the Rx descriptor to handle the Rx buffer. My idea is to do the following three steps when receiving an Rx buffer in ixgbevf_clean_rx_irq(): (1) dummy-write the Rx buffer first, (2) make a copy of its Rx descriptor, (3) check the buffer status and get the length from the copy. Migration may happen at any time. If it happens between (1) and (2): if the Rx buffer has been populated with data, the VF driver will not know that on the new machine because the Rx descriptor isn't migrated. But it's still safe. If it happens between (2) and (3): the copy will be migrated to the new machine and the Rx buffer is migrated first. If there is data in the Rx buffer, the VF driver can still handle the buffer without migrating the Rx descriptor. The next buffers will be ignored since we don't migrate their Rx descriptors; their status will not be completed on the new machine. -- Best regards Tianyu Lan
Re: [PATCH 5/6] KVM: PPC: Book3S HV: Send IPI to host core to wake VCPU
Hi Suresh, [auto build test ERROR on kvm/linux-next -- if it's inappropriate base, please suggest rules for selecting the more suitable base] url: https://github.com/0day-ci/linux/commits/Suresh-Warrier/KVM-PPC-Book3S-HV-Optimize-wakeup-VCPU-from-H_IPI/20151030-081329 config: powerpc-defconfig (attached as .config) reproduce: wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=powerpc All errors (new ones prefixed by >>): arch/powerpc/kvm/book3s_hv_rm_xics.c: In function 'icp_rm_set_vcpu_irq': >> arch/powerpc/kvm/book3s_hv_rm_xics.c:142:4: error: implicit declaration of >> function 'smp_muxed_ipi_rm_message_pass' >> [-Werror=implicit-function-declaration] smp_muxed_ipi_rm_message_pass(hcpu, ^ arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: error: 'PPC_MSG_RM_HOST_ACTION' undeclared (first use in this function) PPC_MSG_RM_HOST_ACTION); ^ arch/powerpc/kvm/book3s_hv_rm_xics.c:143:7: note: each undeclared identifier is reported only once for each function it appears in cc1: all warnings being treated as errors vim +/smp_muxed_ipi_rm_message_pass +142 arch/powerpc/kvm/book3s_hv_rm_xics.c 136 hcore = -1; 137 if (kvmppc_host_rm_ops_hv) 138 hcore = find_available_hostcore(XICS_RM_KICK_VCPU); 139 if (hcore != -1) { 140 hcpu = hcore << threads_shift; 141 kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; > 142 smp_muxed_ipi_rm_message_pass(hcpu, 143 PPC_MSG_RM_HOST_ACTION); 144 } else { 145 this_icp->rm_action |= XICS_RM_KICK_VCPU; --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Re: [PATCH v4 0/6] virtio core DMA API conversion
On Thu, Oct 29, 2015 at 6:09 PM, Andy Lutomirski wrote: > This switches virtio to use the DMA API unconditionally. I'm sure > it breaks things, but it seems to work on x86 using virtio-pci, with > and without Xen, and using both the modern 1.0 variant and the > legacy variant. ... > Andy Lutomirski (5): > virtio_ring: Support DMA APIs > virtio_pci: Use the DMA API > virtio: Add improved queue allocation API > virtio_mmio: Use the DMA API > virtio_pci: Use the DMA API Ugh. The two virtio_pci patches should be squashed together. I'll do that for v5, but I'm not going to send it until there's more feedback. FWIW, I'm collecting this stuff here: https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/log/?h=virtio_dma That branch includes this series (with the squash) and the s390 patches. I'll keep it up to date, since it seems silly to declare it stable enough to stop rebasing yet. --Andy
Re: [PATCH v1 2/2] dma-mapping-common: add DMA attribute - DMA_ATTR_IOMMU_BYPASS
On Thu, 2015-10-29 at 11:31 -0700, Andy Lutomirski wrote: > On Oct 28, 2015 6:11 PM, "Benjamin Herrenschmidt" > wrote: > > > > On Thu, 2015-10-29 at 09:42 +0900, David Woodhouse wrote: > > > On Thu, 2015-10-29 at 09:32 +0900, Benjamin Herrenschmidt wrote: > > > > > > > On Power, I generally have 2 IOMMU windows for a device, one at > > > > the > > > > bottom is remapped, and is generally used for 32-bit devices > > > > and the > > > > one at the top is set up as a bypass > > > > > > So in the normal case of decent 64-bit devices (and not in a VM), > > > they'll *already* be using the bypass region and have full access > > > to > > > all of memory, all of the time? And you have no protection > > > against > > > driver and firmware bugs causing stray DMA? > > > > Correct, we chose to do that for performance reasons. > > Could this be mitigated using pools? I don't know if the net code > would play along easily. Possibly, the pools we have already limit the lock contention but we still have the map/unmap overhead which under a hypervisor can be quite high. I'm not necessarily against changing the way we do things but it would have to be backed up with numbers. Cheers, Ben.