[PATCH v2] vsock/virtio: fix kernel panic from virtio_transport_reset_no_sock

2019-04-19 Thread Adalbert Lazăr
Previous to commit 22b5c0b63f32 ("vsock/virtio: fix kernel panic
after device hot-unplug"), vsock_core_init() was called from
virtio_vsock_probe(). Now, virtio_transport_reset_no_sock() can be called
before vsock_core_init() has the chance to run.
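
For reference, the failing path: in the 5.0 sources, virtio_transport_get_ops()
looks roughly like the sketch below. vsock_core_get_transport() returns NULL
until vsock_core_init() has run and, since 'transport' is the first member of
struct virtio_transport, container_of() propagates the NULL pointer, so the
subsequent load of the send_pkt member reads from a small offset off NULL
(consistent with the 0x110 in CR2 below). A simplified sketch, not a verbatim
copy:

	static const struct virtio_transport *virtio_transport_get_ops(void)
	{
		const struct vsock_transport *t = vsock_core_get_transport();

		/* t is NULL until vsock_core_init() registers the transport */
		return container_of(t, struct virtio_transport, transport);
	}

	/* ...later, in virtio_transport_reset_no_sock(): */
	return virtio_transport_get_ops()->send_pkt(pkt); /* NULL->send_pkt */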

[Wed Feb 27 14:17:09 2019] BUG: unable to handle kernel NULL pointer dereference at 0000000000000110
[Wed Feb 27 14:17:09 2019] #PF error: [normal kernel read fault]
[Wed Feb 27 14:17:09 2019] PGD 0 P4D 0
[Wed Feb 27 14:17:09 2019] Oops: 0000 [#1] SMP PTI
[Wed Feb 27 14:17:09 2019] CPU: 3 PID: 59 Comm: kworker/3:1 Not tainted 5.0.0-rc7-390-generic-hvi #390
[Wed Feb 27 14:17:09 2019] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[Wed Feb 27 14:17:09 2019] Workqueue: virtio_vsock virtio_transport_rx_work [vmw_vsock_virtio_transport]
[Wed Feb 27 14:17:09 2019] RIP: 0010:virtio_transport_reset_no_sock+0x8c/0xc0 [vmw_vsock_virtio_transport_common]
[Wed Feb 27 14:17:09 2019] Code: 35 8b 4f 14 48 8b 57 08 31 f6 44 8b 4f 10 44 8b 07 48 8d 7d c8 e8 84 f8 ff ff 48 85 c0 48 89 c3 74 2a e8 f7 31 03 00 48 89 df <48> 8b 80 10 01 00 00 e8 68 fb 69 ed 48 8b 75 f0 65 48 33 34 25 28
[Wed Feb 27 14:17:09 2019] RSP: 0018:ffffb42701ab7d40 EFLAGS: 00010282
[Wed Feb 27 14:17:09 2019] RAX: 0000000000000000 RBX: ffff9d79637ee080 RCX: 0000000000000003
[Wed Feb 27 14:17:09 2019] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9d79637ee080
[Wed Feb 27 14:17:09 2019] RBP: ffffb42701ab7d78 R08: ffff9d796fae70e0 R09: ffff9d796f403500
[Wed Feb 27 14:17:09 2019] R10: ffffb42701ab7d90 R11: 0000000000000000 R12: ffff9d7969d09240
[Wed Feb 27 14:17:09 2019] R13: ffff9d79624e6840 R14: ffff9d7969d09318 R15: ffff9d796d48ff80
[Wed Feb 27 14:17:09 2019] FS:  0000000000000000(0000) GS:ffff9d796fac0000(0000) knlGS:0000000000000000
[Wed Feb 27 14:17:09 2019] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[Wed Feb 27 14:17:09 2019] CR2: 0000000000000110 CR3: 0000000427f22000 CR4: 00000000000006e0
[Wed Feb 27 14:17:09 2019] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[Wed Feb 27 14:17:09 2019] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[Wed Feb 27 14:17:09 2019] Call Trace:
[Wed Feb 27 14:17:09 2019]  virtio_transport_recv_pkt+0x63/0x820 [vmw_vsock_virtio_transport_common]
[Wed Feb 27 14:17:09 2019]  ? kfree+0x17e/0x190
[Wed Feb 27 14:17:09 2019]  ? detach_buf_split+0x145/0x160
[Wed Feb 27 14:17:09 2019]  ? __switch_to_asm+0x40/0x70
[Wed Feb 27 14:17:09 2019]  virtio_transport_rx_work+0xa0/0x106 [vmw_vsock_virtio_transport]
[Wed Feb 27 14:17:09 2019] NET: Registered protocol family 40
[Wed Feb 27 14:17:09 2019]  process_one_work+0x167/0x410
[Wed Feb 27 14:17:09 2019]  worker_thread+0x4d/0x460
[Wed Feb 27 14:17:09 2019]  kthread+0x105/0x140
[Wed Feb 27 14:17:09 2019]  ? rescuer_thread+0x360/0x360
[Wed Feb 27 14:17:09 2019]  ? kthread_destroy_worker+0x50/0x50
[Wed Feb 27 14:17:09 2019]  ret_from_fork+0x35/0x40
[Wed Feb 27 14:17:09 2019] Modules linked in: vmw_vsock_virtio_transport vmw_vsock_virtio_transport_common input_leds vsock serio_raw i2c_piix4 mac_hid qemu_fw_cfg autofs4 cirrus ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops virtio_net psmouse drm net_failover pata_acpi virtio_blk failover floppy

Fixes: 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device hot-unplug")
Reported-by: Alexandru Herghelegiu 
Signed-off-by: Adalbert Lazăr 
Co-developed-by: Stefan Hajnoczi 
---
 net/vmw_vsock/virtio_transport_common.c | 22 +++---
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 3ae3a33da70b..602715fc9a75 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -662,6 +662,8 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
  */
 static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
 {
+   const struct virtio_transport *t;
+   struct virtio_vsock_pkt *reply;
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RST,
.type = le16_to_cpu(pkt->hdr.type),
@@ -672,15 +674,21 @@ static int virtio_transport_reset_no_sock(struct 
virtio_vsock_pkt *pkt)
if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
return 0;

-   pkt = virtio_transport_alloc_pkt(&info, 0,
-le64_to_cpu(pkt->hdr.dst_cid),
-le32_to_cpu(pkt->hdr.dst_port),
-le64_to_cpu(pkt->hdr.src_cid),
-le32_to_cpu(pkt->hdr.src_port));
-   if (!pkt)
+   reply = virtio_transport_alloc_pkt(&info, 0,
+  le64_to_cpu(pkt->hdr.dst_cid),
+

Re: [PATCH] vsock/virtio: fix kernel panic from virtio_transport_reset_no_sock

2019-04-19 Thread Adalbert Lazăr
On Wed, 6 Mar 2019 08:41:04 +, Stefan Hajnoczi  wrote:
> On Tue, Mar 05, 2019 at 08:01:45PM +0200, Adalbert Lazăr wrote:
> 
> Thanks for the patch, Adalbert!  Please add a Signed-off-by tag so your
> patch can be merged (see Documentation/process/submitting-patches.rst
> Chapter 11 for details on the Developer's Certificate of Origin).
> 
> >  static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
> >  {
> > +   const struct virtio_transport *t;
> > struct virtio_vsock_pkt_info info = {
> > .op = VIRTIO_VSOCK_OP_RST,
> > .type = le16_to_cpu(pkt->hdr.type),
> > @@ -680,7 +681,11 @@ static int virtio_transport_reset_no_sock(struct 
> > virtio_vsock_pkt *pkt)
> > if (!pkt)
> > return -ENOMEM;
> >  
> > -   return virtio_transport_get_ops()->send_pkt(pkt);
> > +   t = virtio_transport_get_ops();
> > +   if (!t)
> > +   return -ENOTCONN;
> 
> pkt is leaked here.  This is an easy mistake to make because the code is
> unclear. 

Thank you for your kind words :)

> The pkt argument is the received packet that we must reply to.
> The reply packet is allocated just before line 680 and must be free
> explicitly for return -ENOTCONN.
> 
> You can avoid the leak and make the code easier to read like this:
> 
>   struct virtio_vsock_pkt *reply;
> 
>   ...
> 
>  -- avoid reusing 'pkt'
> v
>   reply = virtio_transport_alloc_pkt(&info, 0, ...);
>   if (!reply)
>   return -ENOMEM;
> 
>   t = virtio_transport_get_ops();
>   if (!t) {
>   virtio_transport_free_pkt(reply); <-- prevent memory leak
>   return -ENOTCONN;
>   }
>   return t->send_pkt(reply);

What do you think about Stefano's suggestion, to move the check above
the line where the reply is allocated?
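
For reference, a minimal sketch of that variant, reusing the locals from
the v2 patch above: the transport is checked first, so there is nothing
to free on the -ENOTCONN path:

	t = virtio_transport_get_ops();
	if (!t)
		return -ENOTCONN;

	reply = virtio_transport_alloc_pkt(&info, 0,
					   le64_to_cpu(pkt->hdr.dst_cid),
					   le32_to_cpu(pkt->hdr.dst_port),
					   le64_to_cpu(pkt->hdr.src_cid),
					   le32_to_cpu(pkt->hdr.src_port));
	if (!reply)
		return -ENOMEM;

	return t->send_pkt(reply);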

[PATCH] vsock/virtio: fix kernel panic from virtio_transport_reset_no_sock

2019-04-19 Thread Adalbert Lazăr
Previous to commit 22b5c0b63f32 ("vsock/virtio: fix kernel panic after device 
hot-unplug"),
vsock_core_init() was called from virtio_vsock_probe(). Now,
virtio_transport_reset_no_sock() can be called before vsock_core_init()
has the chance to run.

[Wed Feb 27 14:17:09 2019] BUG: unable to handle kernel NULL pointer dereference at 0000000000000110
[Wed Feb 27 14:17:09 2019] #PF error: [normal kernel read fault]
[Wed Feb 27 14:17:09 2019] PGD 0 P4D 0
[Wed Feb 27 14:17:09 2019] Oops: 0000 [#1] SMP PTI
[Wed Feb 27 14:17:09 2019] CPU: 3 PID: 59 Comm: kworker/3:1 Not tainted 5.0.0-rc7-390-generic-hvi #390
[Wed Feb 27 14:17:09 2019] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[Wed Feb 27 14:17:09 2019] Workqueue: virtio_vsock virtio_transport_rx_work [vmw_vsock_virtio_transport]
[Wed Feb 27 14:17:09 2019] RIP: 0010:virtio_transport_reset_no_sock+0x8c/0xc0 [vmw_vsock_virtio_transport_common]
[Wed Feb 27 14:17:09 2019] Code: 35 8b 4f 14 48 8b 57 08 31 f6 44 8b 4f 10 44 8b 07 48 8d 7d c8 e8 84 f8 ff ff 48 85 c0 48 89 c3 74 2a e8 f7 31 03 00 48 89 df <48> 8b 80 10 01 00 00 e8 68 fb 69 ed 48 8b 75 f0 65 48 33 34 25 28
[Wed Feb 27 14:17:09 2019] RSP: 0018:ffffb42701ab7d40 EFLAGS: 00010282
[Wed Feb 27 14:17:09 2019] RAX: 0000000000000000 RBX: ffff9d79637ee080 RCX: 0000000000000003
[Wed Feb 27 14:17:09 2019] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9d79637ee080
[Wed Feb 27 14:17:09 2019] RBP: ffffb42701ab7d78 R08: ffff9d796fae70e0 R09: ffff9d796f403500
[Wed Feb 27 14:17:09 2019] R10: ffffb42701ab7d90 R11: 0000000000000000 R12: ffff9d7969d09240
[Wed Feb 27 14:17:09 2019] R13: ffff9d79624e6840 R14: ffff9d7969d09318 R15: ffff9d796d48ff80
[Wed Feb 27 14:17:09 2019] FS:  0000000000000000(0000) GS:ffff9d796fac0000(0000) knlGS:0000000000000000
[Wed Feb 27 14:17:09 2019] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[Wed Feb 27 14:17:09 2019] CR2: 0000000000000110 CR3: 0000000427f22000 CR4: 00000000000006e0
[Wed Feb 27 14:17:09 2019] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[Wed Feb 27 14:17:09 2019] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[Wed Feb 27 14:17:09 2019] Call Trace:
[Wed Feb 27 14:17:09 2019]  virtio_transport_recv_pkt+0x63/0x820 [vmw_vsock_virtio_transport_common]
[Wed Feb 27 14:17:09 2019]  ? kfree+0x17e/0x190
[Wed Feb 27 14:17:09 2019]  ? detach_buf_split+0x145/0x160
[Wed Feb 27 14:17:09 2019]  ? __switch_to_asm+0x40/0x70
[Wed Feb 27 14:17:09 2019]  virtio_transport_rx_work+0xa0/0x106 [vmw_vsock_virtio_transport]
[Wed Feb 27 14:17:09 2019] NET: Registered protocol family 40
[Wed Feb 27 14:17:09 2019]  process_one_work+0x167/0x410
[Wed Feb 27 14:17:09 2019]  worker_thread+0x4d/0x460
[Wed Feb 27 14:17:09 2019]  kthread+0x105/0x140
[Wed Feb 27 14:17:09 2019]  ? rescuer_thread+0x360/0x360
[Wed Feb 27 14:17:09 2019]  ? kthread_destroy_worker+0x50/0x50
[Wed Feb 27 14:17:09 2019]  ret_from_fork+0x35/0x40
[Wed Feb 27 14:17:09 2019] Modules linked in: vmw_vsock_virtio_transport vmw_vsock_virtio_transport_common input_leds vsock serio_raw i2c_piix4 mac_hid qemu_fw_cfg autofs4 cirrus ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops virtio_net psmouse drm net_failover pata_acpi virtio_blk failover floppy
[Wed Feb 27 14:17:09 2019] CR2: 0000000000000110
[Wed Feb 27 14:17:09 2019] ---[ end trace baa35abd2e040fe5 ]---
[Wed Feb 27 14:17:09 2019] RIP: 0010:virtio_transport_reset_no_sock+0x8c/0xc0 [vmw_vsock_virtio_transport_common]
[Wed Feb 27 14:17:09 2019] Code: 35 8b 4f 14 48 8b 57 08 31 f6 44 8b 4f 10 44 8b 07 48 8d 7d c8 e8 84 f8 ff ff 48 85 c0 48 89 c3 74 2a e8 f7 31 03 00 48 89 df <48> 8b 80 10 01 00 00 e8 68 fb 69 ed 48 8b 75 f0 65 48 33 34 25 28
[Wed Feb 27 14:17:09 2019] RSP: 0018:ffffb42701ab7d40 EFLAGS: 00010282
[Wed Feb 27 14:17:09 2019] RAX: 0000000000000000 RBX: ffff9d79637ee080 RCX: 0000000000000003
[Wed Feb 27 14:17:09 2019] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9d79637ee080
[Wed Feb 27 14:17:09 2019] RBP: ffffb42701ab7d78 R08: ffff9d796fae70e0 R09: ffff9d796f403500
[Wed Feb 27 14:17:09 2019] R10: ffffb42701ab7d90 R11: 0000000000000000 R12: ffff9d7969d09240
[Wed Feb 27 14:17:09 2019] R13: ffff9d79624e6840 R14: ffff9d7969d09318 R15: ffff9d796d48ff80
[Wed Feb 27 14:17:09 2019] FS:  0000000000000000(0000) GS:ffff9d796fac0000(0000) knlGS:0000000000000000
[Wed Feb 27 14:17:09 2019] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[Wed Feb 27 14:17:09 2019] CR2: 0000000000000110 CR3: 0000000427f22000 CR4: 00000000000006e0
[Wed Feb 27 14:17:09 2019] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[Wed Feb 27 14:17:09 2019] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
---
 net/vmw_vsock/virtio_transport_common.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 3ae3a33da70b..502201aaff2a 

Re: [PATCH] vsock/virtio: fix kernel panic from virtio_transport_reset_no_sock

2019-04-19 Thread Adalbert Lazăr
On Wed, 6 Mar 2019 09:12:36 +0100, Stefano Garzarella  
wrote:
> > --- a/net/vmw_vsock/virtio_transport_common.c
> > +++ b/net/vmw_vsock/virtio_transport_common.c
> > @@ -662,6 +662,7 @@ static int virtio_transport_reset(struct vsock_sock 
> > *vsk,
> >   */
> >  static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
> >  {
> > +   const struct virtio_transport *t;
> > struct virtio_vsock_pkt_info info = {
> > .op = VIRTIO_VSOCK_OP_RST,
> > .type = le16_to_cpu(pkt->hdr.type),
> > @@ -680,7 +681,11 @@ static int virtio_transport_reset_no_sock(struct 
> > virtio_vsock_pkt *pkt)
> > if (!pkt)
> > return -ENOMEM;
> >  
> > -   return virtio_transport_get_ops()->send_pkt(pkt);
> > +   t = virtio_transport_get_ops();
> > +   if (!t)
> > +   return -ENOTCONN;
> 
> Should be better to do this check before the virtio_transport_alloc_pkt?
> 
> Otherwise, I think we should free that packet before to return -ENOTCONN.

Right! :D
I will send a second version.


Re: [PATCH] vsock/virtio: fix kernel panic from virtio_transport_reset_no_sock

2019-04-19 Thread Adalbert Lazăr
On Wed, 6 Mar 2019 17:02:16 +, Stefan Hajnoczi  wrote:
> On Wed, Mar 06, 2019 at 11:10:41AM +0200, Adalbert Lazăr wrote:
> > On Wed, 6 Mar 2019 08:41:04 +, Stefan Hajnoczi  
> > wrote:
> > > On Tue, Mar 05, 2019 at 08:01:45PM +0200, Adalbert Lazăr wrote:
> > > The pkt argument is the received packet that we must reply to.
> > > The reply packet is allocated just before line 680 and must be free
> > > explicitly for return -ENOTCONN.
> > > 
> > > You can avoid the leak and make the code easier to read like this:
> > > 
> > >   struct virtio_vsock_pkt *reply;
> > > 
> > >   ...
> > > 
> > >  -- avoid reusing 'pkt'
> > > v
> > >   reply = virtio_transport_alloc_pkt(, 0, ...);
> > >   if (!reply)
> > >   return -ENOMEM;
> > > 
> > >   t = virtio_transport_get_ops();
> > >   if (!t) {
> > >   virtio_transport_free_pkt(reply); <-- prevent memory leak
> > >   return -ENOTCONN;
> > >   }
> > >   return t->send_pkt(reply);
> > 
> > What do you think about Stefano's suggestion, to move the check above
> > the line were the reply is allocated?
> 
> That's fine too.
> 
> However a follow up patch to eliminate the confusing way that 'pkt' is
> reused is still warranted.  If you are busy I'd be happy to send that
> cleanup.
> 
> Stefan

I've got it, a couple of minutes after I've replied :)
The second version[1] should be in your mailbox.

Thank you,
Adalbert

[1]: https://patchwork.kernel.org/patch/10840787/

Re: [RFC PATCH v6 75/92] kvm: x86: disable gpa_available optimization in emulator_read_write_onepage()

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 10:47:34 +0200, Paolo Bonzini  wrote:
> On 09/08/19 18:00, Adalbert Lazăr wrote:
> > If the EPT violation was caused by an execute restriction imposed by the
> > introspection tool, gpa_available will point to the instruction pointer,
> > not the to the read/write location that has to be used to emulate the
> > current instruction.
> > 
> > This optimization should be disabled only when the VM is introspected,
> > not just because the introspection subsystem is present.
> > 
> > Signed-off-by: Adalbert Lazăr 
> 
> The right thing to do is to not set gpa_available for fetch failures in 
> kvm_mmu_page_fault instead:
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 24843cf49579..1bdca40fa831 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -5364,8 +5364,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t 
> cr2, u64 error_code,
>   enum emulation_result er;
>   bool direct = vcpu->arch.mmu->direct_map;
>  
> - /* With shadow page tables, fault_address contains a GVA or nGPA.  */
> - if (vcpu->arch.mmu->direct_map) {
> + /*
> +  * With shadow page tables, fault_address contains a GVA or nGPA.
> +  * On a fetch fault, fault_address contains the instruction pointer.
> +  */
> + if (vcpu->arch.mmu->direct_map &&
> + likely(!(error_code & PFERR_FETCH_MASK))) {
>   vcpu->arch.gpa_available = true;
>   vcpu->arch.gpa_val = cr2;
>   }
> 
> 
> Paolo
> 
> > ---
> >  arch/x86/kvm/x86.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 965c4f0108eb..3975331230b9 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -5532,7 +5532,7 @@ static int emulator_read_write_onepage(unsigned long 
> > addr, void *val,
> >  * operation using rep will only have the initial GPA from the NPF
> >  * occurred.
> >  */
> > -   if (vcpu->arch.gpa_available &&
> > +   if (vcpu->arch.gpa_available && !kvmi_is_present() &&
> > emulator_can_use_gpa(ctxt) &&
> > (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
> > gpa = vcpu->arch.gpa_val;
> > 
> 

Sure, but I think we'll have to extend the check.

Searching the logs I've found:

kvm/x86: re-translate broken translation that caused EPT violation

Signed-off-by: Mircea Cirjaliu 

 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

/home/b/kvmi@9cad844~1/arch/x86/kvm/x86.c:4757,4762 - 
/home/b/kvmi@9cad844/arch/x86/kvm/x86.c:4757,4763
 */
if (vcpu->arch.gpa_available &&
emulator_can_use_gpa(ctxt) &&
+   (vcpu->arch.error_code & PFERR_GUEST_FINAL_MASK) &&
(addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
gpa = vcpu->arch.gpa_val;
ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);

Re: [RFC PATCH v6 70/92] kvm: x86: filter out access rights only when tracked by the introspection tool

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 11:08:39 +0200, Paolo Bonzini  wrote:
> On 09/08/19 18:00, Adalbert Lazăr wrote:
> > It should complete the commit fd34a9518173 ("kvm: x86: consult the page 
> > tracking from kvm_mmu_get_page() and __direct_map()")
> > 
> > Signed-off-by: Adalbert Lazăr 
> > ---
> >  arch/x86/kvm/mmu.c | 3 +++
> >  1 file changed, 3 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 65b6acba82da..fd64cf1115da 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -2660,6 +2660,9 @@ static void clear_sp_write_flooding_count(u64 *spte)
> >  static unsigned int kvm_mmu_page_track_acc(struct kvm_vcpu *vcpu, gfn_t 
> > gfn,
> >unsigned int acc)
> >  {
> > +   if (!kvmi_tracked_gfn(vcpu, gfn))
> > +   return acc;
> > +
> > if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_PREREAD))
> > acc &= ~ACC_USER_MASK;
> > if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_PREWRITE) ||
> > 
> 
> If this patch is always needed, then the function should be named
> something like kvm_mmu_apply_introspection_access and kvmi_tracked_gfn
> should be tested from the moment it is introduced.
> 
> But the commit message says nothing about _why_ it is needed, so I
> cannot guess.  I would very much avoid it however.  Is it just an
> optimization?
> 
> Paolo

We'll retest to see if we still need kvm_mmu_page_track_acc().
The kvmi_tracked_gfn() check was used to keep the KVM code flow
"unchanged" as much as possible. Probably, we can get ride of it.

Re: [RFC PATCH v6 16/92] kvm: introspection: handle events and event replies

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 10:55:21 +0200, Paolo Bonzini  wrote:
> On 09/08/19 17:59, Adalbert Lazăr wrote:
> > 
> > +reply->padding2);
> > +
> > +   ivcpu->reply_waiting = false;
> > +   return expected->error;
> > +}
> > +
> >  /*
> 
> Is this missing a wakeup?
> 
> >  
> > +static bool need_to_wait(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> > +
> > +   return ivcpu->reply_waiting;
> > +}
> > +
> 
> Do you actually need this function?  It seems to me that everywhere you
> call it you already have an ivcpu, so you can just access the field.
> 
> Also, "reply_waiting" means "there is a reply that is waiting".  What
> you mean is "waiting_for_reply".

In an older version, handle_event_reply() was executed from the receiving
thread (having another name) and it contained a wakeup function. Now,
indeed, 'waiting_for_reply' is the right name.
 
> The overall structure of the jobs code is confusing.  The same function
> kvm_run_jobs_and_wait is an infinite loop before and gets a "break"
> later.  It is also not clear why kvmi_job_wait is called through a job.
>  Can you have instead just kvm_run_jobs in KVM_REQ_INTROSPECTION, and
> something like this instead when sending an event:
> 
> int kvmi_wait_for_reply(struct kvm_vcpu *vcpu)
> {
>   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> 
>   while (ivcpu->waiting_for_reply) {
>   kvmi_run_jobs(vcpu);
> 
>   err = swait_event_killable(*wq,
>   !ivcpu->waiting_for_reply ||
>   !list_empty(&ivcpu->job_list));
> 
>   if (err)
>   return -EINTR;
>   }
> 
>   return 0;
> }
> 
> ?
> 
> Paolo

Much better :) Thank you.


Re: [RFC PATCH v6 06/92] kvm: introspection: add KVMI_CONTROL_CMD_RESPONSE

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 11:15:34 +0200, Paolo Bonzini  wrote:
> On 09/08/19 17:59, Adalbert Lazăr wrote:
> > +If `now` is 1, the command reply is enabled/disabled (according to
> > +`enable`) starting with the current command. For example, `enable=0`
> > +and `now=1` means that the reply is disabled for this command too,
+while `enable=0` and `now=0` means that a reply will be sent for this
> > +command, but not for the next ones (until enabled back with another
> > +*KVMI_CONTROL_CMD_RESPONSE*).
> > +
> > +This command is used by the introspection tool to disable the replies
> > +for commands returning an error code only (eg. *KVMI_SET_REGISTERS*)
> > +when an error is less likely to happen. For example, the following
> > +commands can be used to reply to an event with a single `write()` call:
> > +
> > +   KVMI_CONTROL_CMD_RESPONSE enable=0 now=1
> > +   KVMI_SET_REGISTERS vcpu=N
> > +   KVMI_EVENT_REPLY   vcpu=N
> > +   KVMI_CONTROL_CMD_RESPONSE enable=1 now=0
> 
> I don't understand the usage.  Is there any case where you want now == 1
> actually?  Can you just say that KVMI_CONTROL_CMD_RESPONSE never has a
> reply, or to make now==enable?

The enable=1 now=1 is for pause VM:

KVMI_CONTROL_CMD_RESPONSE enable=0 now=1
KVMI_PAUSE_VCPU 0
KVMI_PAUSE_VCPU 1
...
KVMI_CONTROL_CMD_RESPONSE enable=1 now=1

We wait for a reply to make sure the vCPUs were stopped without waiting
for their pause events.

We can get around from userspace, if you like:

KVMI_CONTROL_CMD_RESPONSE enable=0 now=1
KVMI_PAUSE_VCPU 0
KVMI_PAUSE_VCPU 1
...
KVMI_PAUSE_VCPU N-2
KVMI_CONTROL_CMD_RESPONSE enable=1 now=0
KVMI_PAUSE_VCPU N-1

> 
> > +   if (err)
> > +   kvmi_warn(ikvm, "Error code %d discarded for message id %d\n",
> > + err, msg->id);
> > +
> 
> Would it make sense to even close the socket if there is an error?
> 
> Paolo

Sure.

Re: [RFC PATCH v6 02/92] kvm: introspection: add basic ioctls (hook/unhook)

2019-08-13 Thread Adalbert Lazăr
We'll do.

On Tue, 13 Aug 2019 10:44:28 +0200, Paolo Bonzini  wrote:
> On 09/08/19 17:59, Adalbert Lazăr wrote:
> > +static int kvmi_recv(void *arg)
> > +{
> > +   struct kvmi *ikvm = arg;
> > +
> > +   kvmi_info(ikvm, "Hooking VM\n");
> > +
> > +   while (kvmi_msg_process(ikvm))
> > +   ;
> > +
> > +   kvmi_info(ikvm, "Unhooking VM\n");
> > +
> > +   kvmi_end_introspection(ikvm);
> > +
> > +   return 0;
> > +}
> > +
> 
> Rename this to kvmi_recv_thread instead, please.
> 
> > +
> > +   /*
> > +* Make sure all the KVM/KVMI structures are linked and no pointer
> > +* is read as NULL after the reference count has been set.
> > +*/
> > +   smp_mb__before_atomic();
> 
> This is an smp_wmb(), not an smp_mb__before_atomic().  Add a comment
> that it pairs with the refcount_inc_not_zero in kvmi_get.
> 
> +   refcount_set(&ikvm->kvmi_ref, 1);
> > +
> 
> 
> > @@ -57,8 +183,27 @@ void kvmi_destroy_vm(struct kvm *kvm)
> > if (!ikvm)
> > return;
> >  
> > +   /* trigger socket shutdown - kvmi_recv() will start shutdown process */
> > +   kvmi_sock_shutdown(ikvm);
> > +
> > kvmi_put(kvm);
> >  
> > /* wait for introspection resources to be released */
> > wait_for_completion_killable(&kvm->kvmi_completed);
> >  }
> > +
> 
> This addition means that kvmi_destroy_vm should have called
> kvmi_end_introspection instead.  In patch 1, kvmi_end_introspection
> should have been just kvmi_put, now this patch can add kvmi_sock_shutdown.
> 
> Paolo
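
For reference, Paolo's two points combined would give something like this
sketch (not an actual follow-up patch):

	/*
	 * Pairs with refcount_inc_not_zero() in kvmi_get(): make sure all
	 * the KVM/KVMI structures are linked before the count is set.
	 */
	smp_wmb();
	refcount_set(&ikvm->kvmi_ref, 1);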

Re: [RFC PATCH v6 74/92] kvm: x86: do not unconditionally patch the hypercall instruction during emulation

2019-08-14 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 11:20:45 +0200, Paolo Bonzini  wrote:
> On 09/08/19 18:00, Adalbert Lazăr wrote:
> > From: Mihai Donțu 
> > 
> > It can happen that we end up emulating the VMCALL instruction as a
> > result of the handling of an EPT write fault. In this situation, the
> > emulator will try to unconditionally patch the correct hypercall opcode
> > bytes using emulator_write_emulated(). However, this last call uses the
> > fault GPA (if available) or walks the guest page tables at RIP,
> > otherwise. The trouble begins when using KVMI, when we forbid the use of
> > the fault GPA and fallback to the guest pt walk: in Windows (8.1 and
> > newer) the page that we try to write into is marked read-execute and as
> > such emulator_write_emulated() fails and we inject a write #PF, leading
> > to a guest crash.
> > 
> > The fix is rather simple: check the existing instruction bytes before
> > doing the patching. This does not change the normal KVM behaviour, but
> > does help when using KVMI as we no longer inject a write #PF.
> 
> Fixing the hypercall is just an optimization.  Can we just hush and
> return to the guest if emulator_write_emulated returns
> X86EMUL_PROPAGATE_FAULT?
> 
> Paolo

Something like this?

err = emulator_write_emulated(...);
if (err == X86EMUL_PROPAGATE_FAULT)
err = X86EMUL_CONTINUE;
return err;

> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 04b1d2916a0a..965c4f0108eb 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -7363,16 +7363,33 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
> >  
> > +#define KVM_HYPERCALL_INSN_LEN 3
> > +
> >  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
> >  {
> > +   int err;
> > struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
> > -   char instruction[3];
> > +   char buf[KVM_HYPERCALL_INSN_LEN];
> > +   char instruction[KVM_HYPERCALL_INSN_LEN];
> > unsigned long rip = kvm_rip_read(vcpu);
> >  
> > +   err = emulator_read_emulated(ctxt, rip, buf, sizeof(buf),
> > +&ctxt->exception);
> > +   if (err != X86EMUL_CONTINUE)
> > +   return err;
> > +
> > kvm_x86_ops->patch_hypercall(vcpu, instruction);
> > +   if (!memcmp(instruction, buf, sizeof(instruction)))
> > +   /*
> > +* The hypercall instruction is the correct one. Retry
> > +* its execution maybe we got here as a result of an
> > +* event other than #UD which has been resolved in the
> > +* mean time.
> > +*/
> > +   return X86EMUL_CONTINUE;
> >  
> > -   return emulator_write_emulated(ctxt, rip, instruction, 3,
> > -   &ctxt->exception);
> > +   return emulator_write_emulated(ctxt, rip, instruction,
> > +  sizeof(instruction), &ctxt->exception);
> >  }

Re: [RFC PATCH v6 14/92] kvm: introspection: handle introspection commands before returning to guest

2019-08-14 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 16:45:11 +0200, Paolo Bonzini  wrote:
> On 13/08/19 15:54, Adalbert Lazăr wrote:
> > Leaving kvm_vcpu_block() in order to handle a request such as 'pause',
> > would cause the vCPU to enter the guest when resumed. Most of the
> > time this does not appear to be an issue, but during early boot it
> > can happen for a non-boot vCPU to start executing code from areas that
> > first needed to be set up by vCPU #0.
> > 
> > In a particular case, vCPU #1 executed code which resided in an area
> > not covered by a memslot, which caused an EPT violation that got
> > turned in mmu_set_spte() into a MMIO request that required emulation.
> > Unfortunately, the emulator tripped, exited to userspace and the VM
> > was aborted.
> 
> Okay, this makes sense.  Maybe you want to handle KVM_REQ_INTROSPECTION
> in vcpu_run rather than vcpu_enter_guest?
> 
> Paolo

Right! We've missed that.

Re: [RFC PATCH v6 01/92] kvm: introduce KVMI (VM introspection subsystem)

2019-08-14 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 08:01:28 -0700, Sean Christopherson 
 wrote:
> On Tue, Aug 13, 2019 at 02:09:51PM +0200, Paolo Bonzini wrote:
> > On 13/08/19 13:57, Adalbert Lazăr wrote:
> > >> The refcounting approach seems a bit backwards, and AFAICT is driven by
> > >> implementing unhook via a message, which also seems backwards.  I assume
> > >> hook and unhook are relatively rare events and not performance critical,
> > >> so make those the restricted/slow flows, e.g. force userspace to quiesce
> > >> the VM by making unhook() mutually exclusive with every vcpu ioctl() and
> > >> maybe anything that takes kvm->lock. 
> > >>
> > >> Then kvmi_ioctl_unhook() can use thread_stop() and kvmi_recv() just needs
> > >> to check kthread_should_stop().
> > >>
> > >> That way kvmi doesn't need to be refcounted since it's guaranteed to be
> > >> alive if the pointer is non-null.  Eliminating the refcounting will clean
> > >> up a lot of the code by eliminating calls to kvmi_{get,put}(), e.g.
> > >> wrappers like kvmi_breakpoint_event() just check vcpu->kvmi, or maybe
> > >> even get dropped altogether.
> > > 
> > > The unhook event has been added to cover the following case: while the
> > > introspection tool runs in another VM, both VMs, the virtual appliance
> > > and the introspected VM, could be paused by the user. We needed a way
> > > to signal this to the introspection tool and give it time to unhook
> > > (the introspected VM has to run and execute the introspection commands
> > > during this phase). The receiving threads quits when the socket is closed
> > > (by QEMU or by the introspection tool).
> 
> Why does closing the socket require destroying the kvmi object?  E.g. can
> it be marked as defunct or whatever and only fully removed on a synchronous
> unhook from userspace?  Re-hooking could either require said unhook, or
> maybe reuse the existing kvmi object with a new socket.

Will it be better to have the following ioctls?

  - hook (alloc kvmi and kvmi_vcpu structs)
  - notify_imminent_unhook (send the KVMI_EVENT_UNHOOK event)
  - unhook (free kvmi and kvmi_vcpu structs)
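
Roughly (a sketch; the ioctl names and numbers below are illustrative,
not taken from the series):

	/* hook: alloc kvmi/kvmi_vcpu and start the receiving thread */
	#define KVM_INTROSPECTION_HOOK      _IOW(KVMIO, 0xc0, struct kvm_introspection)
	/* notify_imminent_unhook: send the KVMI_EVENT_UNHOOK event */
	#define KVM_INTROSPECTION_PREUNHOOK _IO(KVMIO, 0xc1)
	/* unhook: tear down and free kvmi/kvmi_vcpu */
	#define KVM_INTROSPECTION_UNHOOK    _IO(KVMIO, 0xc2)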

Re: DANGER WILL ROBINSON, DANGER

2019-08-13 Thread Adalbert Lazăr
On Fri, 9 Aug 2019 09:24:44 -0700, Matthew Wilcox  wrote:
> On Fri, Aug 09, 2019 at 07:00:26PM +0300, Adalbert Lazăr wrote:
> > +++ b/include/linux/page-flags.h
> > @@ -417,8 +417,10 @@ PAGEFLAG(Idle, idle, PF_ANY)
> >   */
> >  #define PAGE_MAPPING_ANON  0x1
> >  #define PAGE_MAPPING_MOVABLE   0x2
> > +#define PAGE_MAPPING_REMOTE0x4
> 
> Uh.  How do you know page->mapping would otherwise have bit 2 clear?
> Who's guaranteeing that?
> 
> This is an awfully big patch to the memory management code, buried in
> the middle of a gigantic series which almost guarantees nobody would
> look at it.  I call shenanigans.
> 
> > @@ -1021,7 +1022,7 @@ void page_move_anon_rmap(struct page *page, struct 
> > vm_area_struct *vma)
> >   * __page_set_anon_rmap - set up new anonymous rmap
> >   * @page:  Page or Hugepage to add to rmap
> >   * @vma:   VM area to add page to.
> > - * @address:   User virtual address of the mapping 
> > + * @address:   User virtual address of the mapping
> 
> And mixing in fluff changes like this is a real no-no.  Try again.
> 

No bad intentions, just overzealous.
I didn't want to hide anything from our patches.
Once we advance with the introspection patches related to KVM we'll be
back with the remote mapping patch, split and cleaned.

Thanks

Re: [RFC PATCH v6 14/92] kvm: introspection: handle introspection commands before returning to guest

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 10:26:29 +0200, Paolo Bonzini  wrote:
> On 09/08/19 17:59, Adalbert Lazăr wrote:
> > +   prepare_to_swait_exclusive(&vcpu->wq, &wait,
> > +  TASK_INTERRUPTIBLE);
> > +
> > +   if (kvm_vcpu_check_block(vcpu) < 0)
> > +   break;
> > +
> > +   waited = true;
> > +   schedule();
> > +
> > +   if (kvm_check_request(KVM_REQ_INTROSPECTION, vcpu)) {
> > +   do_kvmi_work = true;
> > +   break;
> > +   }
> > +   }
> >  
> > -   waited = true;
> > -   schedule();
> > +   finish_swait(&vcpu->wq, &wait);
> > +
> > +   if (do_kvmi_work)
> > +   kvmi_handle_requests(vcpu);
> > +   else
> > +   break;
> > }
> 
> Is this needed?  Or can it just go back to KVM_RUN and handle
> KVM_REQ_INTROSPECTION there (in which case it would be basically
> premature optimization)?
> 

It might still be needed, unless we can get back to this function.

The original commit message for this change was this:

kvm: do not abort kvm_vcpu_block() in order to handle KVMI requests

Leaving kvm_vcpu_block() in order to handle a request such as 'pause',
would cause the vCPU to enter the guest when resumed. Most of the
time this does not appear to be an issue, but during early boot it
can happen for a non-boot vCPU to start executing code from areas that
first needed to be set up by vCPU #0.

In a particular case, vCPU #1 executed code which resided in an area
not covered by a memslot, which caused an EPT violation that got
turned in mmu_set_spte() into a MMIO request that required emulation.
Unfortunately, the emulator tripped, exited to userspace and the VM
was aborted.

Re: [RFC PATCH v6 13/92] kvm: introspection: make the vCPU wait even when its jobs list is empty

2019-08-13 Thread Adalbert Lazăr
On Tue, 13 Aug 2019 10:43:52 +0200, Paolo Bonzini  wrote:
> On 09/08/19 17:59, Adalbert Lazăr wrote:
> > +void kvmi_handle_requests(struct kvm_vcpu *vcpu)
> > +{
> > +   struct kvmi *ikvm;
> > +
> > +   ikvm = kvmi_get(vcpu->kvm);
> > +   if (!ikvm)
> > +   return;
> > +
> > +   for (;;) {
> > +   int err = kvmi_run_jobs_and_wait(vcpu);
> > +
> > +   if (err)
> > +   break;
> > +   }
> > +
> > +   kvmi_put(vcpu->kvm);
> > +}
> > +
> 
> Using kvmi_run_jobs_and_wait from two places (here and kvmi_send_event)
> is very confusing.  Does kvmi_handle_requests need to do this, or can it
> just use kvmi_run_jobs?

I think I've added this wait to block vCPUs during single-step.
A 'wait_until_single_step_finished' job will do, I guess, so we could
use kvmi_run_jobs() here.
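
Something like this sketch, assuming the job locking stays inside
kvmi_run_jobs():

	void kvmi_handle_requests(struct kvm_vcpu *vcpu)
	{
		struct kvmi *ikvm = kvmi_get(vcpu->kvm);

		if (!ikvm)
			return;

		/* run the queued jobs once; no waiting for single-step here */
		kvmi_run_jobs(vcpu);

		kvmi_put(vcpu->kvm);
	}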

Re: [RFC PATCH v6 64/92] kvm: introspection: add single-stepping

2019-08-13 Thread Adalbert Lazăr
On Mon, 12 Aug 2019 13:50:39 -0700, Sean Christopherson 
 wrote:
> On Fri, Aug 09, 2019 at 07:00:19PM +0300, Adalbert Lazăr wrote:
> > From: Nicușor Cîțu 
> > 
> > This would be used either if the introspection tool request it as a
> > reply to a KVMI_EVENT_PF event or to cope with instructions that cannot
> > be handled by the x86 emulator during the handling of a VMEXIT. In
> > these situations, all other vCPU-s are kicked and held, the EPT-based
> > protection is removed and the guest is single stepped by the vCPU that
> > triggered the initial VMEXIT. Upon completion the EPT-based protection
> > is reinstalled and all vCPU-s are allowed to return to the guest.
> > 
> > This is a rather slow workaround that kicks in occasionally. In the
> > future, the most frequently single-stepped instructions should be added
> > to the emulator (usually, stores to and from memory - SSE/AVX).
> > 
> > For the moment it works only on Intel.
> > 
> > CC: Jim Mattson 
> > CC: Sean Christopherson 
> > CC: Joerg Roedel 
> > Signed-off-by: Nicușor Cîțu 
> > Co-developed-by: Mihai Donțu 
> > Signed-off-by: Mihai Donțu 
> > Co-developed-by: Adalbert Lazăr 
> > Signed-off-by: Adalbert Lazăr 
> > ---
> >  arch/x86/include/asm/kvm_host.h |   3 +
> >  arch/x86/kvm/kvmi.c |  47 ++-
> >  arch/x86/kvm/svm.c  |   5 ++
> >  arch/x86/kvm/vmx/vmx.c  |  17 
> >  arch/x86/kvm/x86.c  |  19 +
> >  include/linux/kvmi.h|   4 +
> >  virt/kvm/kvmi.c | 145 +++-
> >  virt/kvm/kvmi_int.h |  16 
> >  8 files changed, 253 insertions(+), 3 deletions(-)
> > 

[...] We'll do.

> > diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
> > index d7f9858d3e97..1550fe33ed48 100644
> > --- a/virt/kvm/kvmi_int.h
> > +++ b/virt/kvm/kvmi_int.h
> > @@ -126,6 +126,9 @@ struct kvmi_vcpu {
> > DECLARE_BITMAP(high, KVMI_NUM_MSR);
> > } msr_mask;
> >  
> > +   bool ss_owner;
> 
> Why is single-stepping mutually exclusive across all vCPUs?  Does that
> always have to be the case?

I never thought of single-stepping multiple vCPUs at the same time.

If one vCPU relaxes the access to a guest page while a second one,
finishing single-stepping, restores the 'r--' flags, the first one
will get another page fault and relax the page access again. It might
be doable, but before starting to single-step a vCPU we might replace
guest memory (as requested by the introspection tool) and we would have
to use a lock for this.

However, we would like to use alternate EPT views with single-step.
So, we might replace this patch.

> > +   bool ss_requested;
> > +
> > struct list_head job_list;
> > spinlock_t job_lock;
> >  
> > @@ -151,6 +154,15 @@ struct kvmi {
> > DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
> > DECLARE_BITMAP(vm_ev_mask, KVMI_NUM_EVENTS);
> >  
> > +#define SINGLE_STEP_MAX_DEPTH 8
> > +   struct {
> > +   gfn_t gfn;
> > +   u8 old_access;
> > +   u32 old_write_bitmap;
> > +   } ss_context[SINGLE_STEP_MAX_DEPTH];
> > +   u8 ss_level;
> > +   atomic_t ss_active;

Re: [RFC PATCH v6 01/92] kvm: introduce KVMI (VM introspection subsystem)

2019-08-13 Thread Adalbert Lazăr
On Mon, 12 Aug 2019 13:20:30 -0700, Sean Christopherson 
 wrote:
> On Fri, Aug 09, 2019 at 06:59:16PM +0300, Adalbert Lazăr wrote:
> > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> > index 72fa955f4a15..f70a6a1b6814 100644
> > --- a/arch/x86/kvm/Kconfig
> > +++ b/arch/x86/kvm/Kconfig
> > @@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
> >  This option adds a R/W kVM module parameter 'mmu_audit', which allows
> >  auditing of KVM MMU events at runtime.
> >  
> > +config KVM_INTROSPECTION
> > +   bool "VM Introspection"
> > +   depends on KVM && (KVM_INTEL || KVM_AMD)
> > +   help
> > +This option enables functions to control the execution of VM-s, query
> > +the state of the vCPU-s (GPR-s, MSR-s etc.).
> 
> This does a lot more than enable functions, it allows userspace to do all
> of these things *while the VM is running*.  Everything above can already
> be done by userspace.

First of all, thanks for helping us with this patch series.

Do you mean something like this?

This option enables an introspection app to control any running
VM if userspace/QEMU allows it.

> 
> The "-s" syntax is difficult to read and unnecessary, e.g. at first I
> thought VM-s was referring to a new subsystem or feature introduced by
> introspection.  VMs, vCPUs, GPRs, MSRs, etc...
> 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index c38cc5eb7e73..582b0187f5a4 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -455,6 +455,10 @@ struct kvm {
> > struct srcu_struct srcu;
> > struct srcu_struct irq_srcu;
> > pid_t userspace_pid;
> > +
> > +   struct completion kvmi_completed;
> > +   refcount_t kvmi_ref;
> 
> The refcounting approach seems a bit backwards, and AFAICT is driven by
> implementing unhook via a message, which also seems backwards.  I assume
> hook and unhook are relatively rare events and not performance critical,
> so make those the restricted/slow flows, e.g. force userspace to quiesce
> the VM by making unhook() mutually exclusive with every vcpu ioctl() and
> maybe anything that takes kvm->lock. 
> 
> Then kvmi_ioctl_unhook() can use thread_stop() and kvmi_recv() just needs
> to check kthread_should_stop().
> 
> That way kvmi doesn't need to be refcounted since it's guaranteed to be
> alive if the pointer is non-null.  Eliminating the refcounting will clean
> up a lot of the code by eliminating calls to kvmi_{get,put}(), e.g.
> wrappers like kvmi_breakpoint_event() just check vcpu->kvmi, or maybe
> even get dropped altogether.

The unhook event has been added to cover the following case: while the
introspection tool runs in another VM, both VMs, the virtual appliance
and the introspected VM, could be paused by the user. We needed a way
to signal this to the introspection tool and give it time to unhook
(the introspected VM has to run and execute the introspection commands
during this phase). The receiving threads quits when the socket is closed
(by QEMU or by the introspection tool).

It's a bit unclear how, but we'll try to get rid of the refcount object,
which will remove a lot of code, indeed.

> 
> > +   void *kvmi;
> 
> Why is this a void*?  Just forward declare struct kvmi in kvmi.h.
> 
> IMO this should be 'struct kvm_introspection *introspection', similar to
> 'struct kvm_vcpu_arch arch' and 'struct kvm_vmx'.  Ditto for the vCPU
> flavor.  Local variables could be kvmi+vcpui, kvm_i+vcpu_i, or maybe
> a more long form if someone can come up with a good abbreviation?
> 
> Using 'ikvm' as the local variable name when everything else refers to
> introspection as 'kvmi' is especially funky.

We'll do.

> 
> >  };
> >  
> >  #define kvm_err(fmt, ...) \
> > diff --git a/include/linux/kvmi.h b/include/linux/kvmi.h
> > new file mode 100644
> > index ..e36de3f9f3de
> > --- /dev/null
> > +++ b/include/linux/kvmi.h
> > @@ -0,0 +1,23 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __KVMI_H__
> > +#define __KVMI_H__
> > +
> > +#define kvmi_is_present() IS_ENABLED(CONFIG_KVM_INTROSPECTION)
> 
> Peeking forward a few patches, introspection should have a module param.

Like kvm.introspection=true/false?
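
i.e., something along these lines (a sketch; the parameter name and
permissions are assumptions):

	static bool enable_introspection;
	module_param_named(introspection, enable_introspection, bool, 0444);
	MODULE_PARM_DESC(introspection, "Enable the VM introspection subsystem");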

> The code is also inconsistent in its usage of kvmi_is_present() versus
> #ifdef CONFIG_KVM_INTROSPECTION.
> 
> And maybe kvm_is_instrospection_enabled() so that the gating function has
> a more descriptive name for first-time readers?

Right.

> > diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
> > new file mode 100644
> > index 000

Re: [RFC PATCH v6 69/92] kvm: x86: keep the page protected if tracked by the introspection tool

2019-09-10 Thread Adalbert Lazăr
On Tue, 10 Sep 2019 10:26:42 -0400, Konrad Rzeszutek Wilk 
 wrote:
> On Fri, Aug 09, 2019 at 07:00:24PM +0300, Adalbert Lazăr wrote:
> > This patch might be obsolete thanks to single-stepping.
> 
> sooo should it be skipped from this large patchset to ease
> review?

I'll add a couple of warning messages to check if this patch is still
needed, in order to skip it from the next submission (which will be smaller:)

However, on AMD, single-stepping is not an option.

Thanks,
Adalbert

> 
> > 
> > Signed-off-by: Adalbert Lazăr 
> > ---
> >  arch/x86/kvm/x86.c | 9 +++--
> >  1 file changed, 7 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 2c06de73a784..06f44ce8ed07 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -6311,7 +6311,8 @@ static bool reexecute_instruction(struct kvm_vcpu 
> > *vcpu, gva_t cr2,
> > indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
> > spin_unlock(>kvm->mmu_lock);
> >  
> > -   if (indirect_shadow_pages)
> > +   if (indirect_shadow_pages
> > +   && !kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
> > kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >  
> > return true;
> > @@ -6322,7 +6323,8 @@ static bool reexecute_instruction(struct kvm_vcpu 
> > *vcpu, gva_t cr2,
> >  * and it failed try to unshadow page and re-enter the
> >  * guest to let CPU execute the instruction.
> >  */
> > -   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> > +   if (!kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
> > +   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >  
> > /*
> >  * If the access faults on its page table, it can not
> > @@ -6374,6 +6376,9 @@ static bool retry_instruction(struct x86_emulate_ctxt 
> > *ctxt,
> > if (!vcpu->arch.mmu->direct_map)
> > gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
> >  
> > +   if (kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
> > +   return false;
> > +
> > kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> >  
> > return true;

[RFC PATCH v6 00/92] VM introspection

2019-08-09 Thread Adalbert Lazăr
-stepping, exception
injection and remote memory mapping
  - the guests are much more stable (on par with our introspection
products using Xen)
  - speed improvements (the penalty on web browsing actions is 50% lower,
at least)


Adalbert Lazăr (25):
  kvm: introspection: add basic ioctls (hook/unhook)
  kvm: introspection: add permission access ioctls
  kvm: introspection: add the read/dispatch message function
  kvm: introspection: add KVMI_GET_VERSION
  kvm: introspection: add KVMI_CONTROL_CMD_RESPONSE
  kvm: introspection: honor the reply option when handling the
KVMI_GET_VERSION command
  kvm: introspection: add KVMI_CHECK_COMMAND and KVMI_CHECK_EVENT
  kvm: introspection: add KVMI_CONTROL_VM_EVENTS
  kvm: introspection: add a jobs list to every introspected vCPU
  kvm: introspection: make the vCPU wait even when its jobs list is
empty
  kvm: introspection: add KVMI_EVENT_UNHOOK
  kvm: x86: intercept the write access on sidt and other emulated
instructions
  kvm: introspection: add KVMI_CONTROL_SPP
  kvm: introspection: extend the internal database of tracked pages with
write_bitmap info
  kvm: introspection: add KVMI_GET_PAGE_WRITE_BITMAP
  kvm: introspection: add KVMI_SET_PAGE_WRITE_BITMAP
  kvm: add kvm_vcpu_kick_and_wait()
  kvm: introspection: add KVMI_PAUSE_VCPU and KVMI_EVENT_PAUSE_VCPU
  kvm: x86: add kvm_arch_vcpu_set_guest_debug()
  kvm: introspection: add custom input when single-stepping a vCPU
  kvm: x86: keep the page protected if tracked by the introspection tool
  kvm: x86: filter out access rights only when tracked by the
introspection tool
  kvm: x86: disable gpa_available optimization in
emulator_read_write_onepage()
  kvm: x86: disable EPT A/D bits if introspection is present
  kvm: introspection: add trace functions

Marian Rotariu (1):
  kvm: introspection: add KVMI_GET_CPUID

Mihai Donțu (47):
  kvm: introduce KVMI (VM introspection subsystem)
  kvm: introspection: add KVMI_GET_GUEST_INFO
  kvm: introspection: handle introspection commands before returning to
guest
  kvm: introspection: handle vCPU related introspection commands
  kvm: introspection: handle events and event replies
  kvm: introspection: introduce event actions
  kvm: introspection: add KVMI_GET_VCPU_INFO
  kvm: page track: add track_create_slot() callback
  kvm: x86: provide all page tracking hooks with the guest virtual
address
  kvm: page track: add support for preread, prewrite and preexec
  kvm: x86: wire in the preread/prewrite/preexec page trackers
  kvm: x86: add kvm_mmu_nested_pagefault()
  kvm: introspection: use page track
  kvm: x86: consult the page tracking from kvm_mmu_get_page() and
__direct_map()
  kvm: introspection: add KVMI_CONTROL_EVENTS
  kvm: x86: add kvm_spt_fault()
  kvm: introspection: add KVMI_EVENT_PF
  kvm: introspection: add KVMI_GET_PAGE_ACCESS
  kvm: introspection: add KVMI_SET_PAGE_ACCESS
  kvm: introspection: add KVMI_READ_PHYSICAL and KVMI_WRITE_PHYSICAL
  kvm: introspection: add KVMI_GET_REGISTERS
  kvm: introspection: add KVMI_SET_REGISTERS
  kvm: introspection: add KVMI_INJECT_EXCEPTION + KVMI_EVENT_TRAP
  kvm: introspection: add KVMI_CONTROL_CR and KVMI_EVENT_CR
  kvm: introspection: add KVMI_CONTROL_MSR and KVMI_EVENT_MSR
  kvm: introspection: add KVMI_GET_XSAVE
  kvm: introspection: add KVMI_GET_MTRR_TYPE
  kvm: introspection: add KVMI_EVENT_XSETBV
  kvm: introspection: add KVMI_EVENT_BREAKPOINT
  kvm: introspection: add KVMI_EVENT_HYPERCALL
  kvm: introspection: use single stepping on unimplemented instructions
  kvm: x86: emulate a guest page table walk on SPT violations due to A/D
bit updates
  kvm: x86: do not unconditionally patch the hypercall instruction
during emulation
  kvm: x86: emulate movsd xmm, m64
  kvm: x86: emulate movss xmm, m32
  kvm: x86: emulate movq xmm, m64
  kvm: x86: emulate movq r, xmm
  kvm: x86: emulate movd xmm, m32
  kvm: x86: enable the half part of movss, movsd, movups
  kvm: x86: emulate lfence
  kvm: x86: emulate xorpd xmm2/m128, xmm1
  kvm: x86: emulate xorps xmm/m128, xmm
  kvm: x86: emulate fst/fstp m64fp
  kvm: x86: make lock cmpxchg r, r/m atomic
  kvm: x86: emulate lock cmpxchg8b atomically
  kvm: x86: emulate lock cmpxchg16b m128
  kvm: x86: fallback to the single-step on multipage CMPXCHG emulation

Mircea Cîrjaliu (5):
  kvm: introspection: add vCPU related data
  kvm: introspection: add KVMI_EVENT_CREATE_VCPU
  mm: add support for remote mapping
  kvm: introspection: add memory map/unmap support on the guest side
  kvm: introspection: use remote mapping

Nicușor Cîțu (5):
  kvm: x86: block any attempt to disable MSR interception if tracked by
introspection
  kvm: introspection: add KVMI_EVENT_DESCRIPTOR
  kvm: introspection: add single-stepping
  kvm: introspection: add KVMI_EVENT_SINGLESTEP
  kvm: x86: add tracepoints for interrupt and exception injections

Yang Weijiang (9):
  Documentation: Introduce EPT based Subpage Protection
  KVM: VMX: Add control flags for SPP

[RFC PATCH v6 04/92] kvm: introspection: add the read/dispatch message function

2019-08-09 Thread Adalbert Lazăr
Based on the common header used by all messages (struct kvmi_msg_hdr),
the worker will read/validate all messages, execute the VM introspection
commands (eg. KVMI_GET_GUEST_INFO) and dispatch to vCPUs the vCPU
introspection commands (eg. KVMI_GET_REGISTERS) and the replies to
vCPU events. The vCPU threads will reply to vCPU introspection commands
without the help of the receiving worker.

Because of the command header (struct kvmi_error_code) used in any
command reply, this worker could respond to any unsupported/disallowed
command with an error code.

This thread will end when the socket is closed (signaled by userspace/QEMU
or the introspection tool) or on the first API error (eg. wrong message
size).

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  86 +++
 include/uapi/linux/kvmi.h  |  13 ++
 virt/kvm/kvmi.c|  43 +-
 virt/kvm/kvmi_int.h|   7 +
 virt/kvm/kvmi_msg.c| 240 -
 5 files changed, 386 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 47b7c36d334a..1d4a1dcd7d2f 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -64,6 +64,85 @@ used on that guest. Obviously, whether the guest can really 
continue
 normal execution depends on whether the introspection tool has made any
 modifications that require an active KVMI channel.
 
+All messages (commands or events) have a common header::
+
+   struct kvmi_msg_hdr {
+   __u16 id;
+   __u16 size;
+   __u32 seq;
+   };
+
+The replies have the same header, with the sequence number (``seq``)
+and message id (``id``) matching the command/event.
+
+After ``kvmi_msg_hdr``, ``id`` specific data of ``size`` bytes will
+follow.
+
+The message header and its data must be sent with one ``sendmsg()`` call
+to the socket. This simplifies the receiver loop and avoids
+the reconstruction of messages on the other side.
+
+The wire protocol uses the host native byte-order. The introspection tool
+must check this during the handshake and do the necessary conversion.
+
+A command reply begins with::
+
+   struct kvmi_error_code {
+   __s32 err;
+   __u32 padding;
+   }
+
+followed by the command specific data if the error code ``err`` is zero.
+
+The error code -KVM_EOPNOTSUPP is returned for unsupported commands.
+
+The error code -KVM_EPERM is returned for disallowed commands (see 
**Hooking**).
+
+The error code is related to the message processing, including unsupported
+commands. For all the other errors (incomplete messages, wrong sequence
+numbers, socket errors etc.) the socket will be closed. The device
+manager should reconnect.
+
+While all commands will have a reply as soon as possible, the replies
+to events will probably be delayed until a set of (new) commands will
+complete::
+
+   Host kernel   Tool
+   ---   
+   event 1 ->
+ <- command 1
+   command 1 reply ->
+ <- command 2
+   command 2 reply ->
+ <- event 1 reply
+
+If both ends send a message at the same time::
+
+   Host kernel   Tool
+   ---   
+   event X -><- command X
+
+the host kernel will reply to 'command X', regardless of the receive time
+(before or after the 'event X' was sent).
+
+As it can be seen below, the wire protocol specifies occasional padding. This
+is to permit working with the data by directly using C structures or to round
+the structure size to a multiple of 8 bytes (64bit) to improve the copy
+operations that happen during ``recvmsg()`` or ``sendmsg()``. The members
+should have the native alignment of the host (4 bytes on x86). All padding
+must be initialized with zero otherwise the respective commands will fail
+with -KVM_EINVAL.
+
+To describe the commands/events, we reuse some conventions from api.txt:
+
+  - Architectures: which instruction set architectures provide this 
command/event
+
+  - Versions: which versions provide this command/event
+
+  - Parameters: incoming message data
+
+  - Returns: outgoing/reply message data
+
 Handshake
 -
 
@@ -99,6 +178,13 @@ commands/events) to KVM, and forget about it. It will be 
notified by
 KVM when the introspection tool closes the file handle (in case of
 errors), and should reinitiate the handshake.
 
+Once the file handle reaches KVM, the introspection tool should use
+the *KVMI_GET_VERSION* command to get the API version and/or
+the *KVMI_CHECK_COMMAND* and *KVMI_CHECK_EVENTS* commands to see which
+commands/events are allowed for this guest. The error code -KVM_EPERM
+will be returned if the introspection tool uses a command or enables an
+event which is disallowed.
+
 Unhooking
 -
 
diff --git a/inc
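
As an aside, the one-sendmsg()-per-message rule above maps naturally onto
an iovec pair on the tool side. A hedged userspace sketch (not part of the
patches; kvmi_send_cmd() is an illustrative helper):

	#include <linux/types.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>

	struct kvmi_msg_hdr {
		__u16 id;
		__u16 size;
		__u32 seq;
	};

	static int kvmi_send_cmd(int fd, __u16 id, __u32 seq,
				 const void *data, __u16 size)
	{
		struct kvmi_msg_hdr hdr;
		struct iovec iov[2] = {
			{ .iov_base = &hdr, .iov_len = sizeof(hdr) },
			{ .iov_base = (void *)data, .iov_len = size },
		};
		struct msghdr msg = {
			.msg_iov = iov,
			.msg_iovlen = size ? 2 : 1,
		};

		/* all padding (here: none in the header) must be zeroed */
		memset(&hdr, 0, sizeof(hdr));
		hdr.id = id;
		hdr.size = size;
		hdr.seq = seq;

		/* header + payload in a single sendmsg(), as required */
		return sendmsg(fd, &msg, 0) == (ssize_t)(sizeof(hdr) + size) ? 0 : -1;
	}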

[RFC PATCH v6 06/92] kvm: introspection: add KVMI_CONTROL_CMD_RESPONSE

2019-08-09 Thread Adalbert Lazăr
This command enables/disables the command replies. It is useful when
the introspection tool sends multiple messages with one write() call and
doesn't have to wait for a reply.

IIRC, the speed improvement seen during UnixBench tests in a VM
introspected through vsock (the introspection tool was running in a
different VM) was around 5-10%.

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 50 ++
 include/uapi/linux/kvmi.h  |  7 
 virt/kvm/kvmi_int.h|  2 ++
 virt/kvm/kvmi_msg.c| 57 ++
 4 files changed, 116 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 0f296e3c4244..82de474d512b 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -252,3 +252,53 @@ Returns the introspection API version.
 
 This command is always allowed and successful (if the introspection is
 built in kernel).
+
+2. KVMI_CONTROL_CMD_RESPONSE
+
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_control_cmd_response {
+   __u8 enable;
+   __u8 now;
+   __u16 padding1;
+   __u32 padding2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Enables or disables the command replies. By default, all commands need
+a reply.
+
+If `now` is 1, the command reply is enabled/disabled (according to
+`enable`) starting with the current command. For example, `enable=0`
+and `now=1` means that the reply is disabled for this command too,
+while `enable=0` and `now=0` means that a reply will be sent for this
+command, but not for the next ones (until enabled back with another
+*KVMI_CONTROL_CMD_RESPONSE*).
+
+This command is used by the introspection tool to disable the replies
+for commands returning an error code only (eg. *KVMI_SET_REGISTERS*)
+when an error is less likely to happen. For example, the following
+commands can be used to reply to an event with a single `write()` call:
+
+   KVMI_CONTROL_CMD_RESPONSE enable=0 now=1
+   KVMI_SET_REGISTERS vcpu=N
+   KVMI_EVENT_REPLY   vcpu=N
+   KVMI_CONTROL_CMD_RESPONSE enable=1 now=0
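
A rough userspace sketch of that batching (the four ``*_msg`` buffers are
assumed to be already serialized messages, each starting with its
``kvmi_msg_hdr``)::

	#include <sys/uio.h>

	struct iovec iov[4] = {
		{ .iov_base = &disable_msg,     .iov_len = sizeof(disable_msg) },
		{ .iov_base = &set_regs_msg,    .iov_len = sizeof(set_regs_msg) },
		{ .iov_base = &event_reply_msg, .iov_len = sizeof(event_reply_msg) },
		{ .iov_base = &enable_msg,      .iov_len = sizeof(enable_msg) },
	};

	/* One write; only the last message will generate a reply. */
	writev(kvmi_fd, iov, 4);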
+
+While the command reply is disabled:
+
+* the socket will be closed on any command for which the reply should
+  contain more than just an error code (eg. *KVMI_GET_REGISTERS*)
+
+* the reply-disable setting is ignored for any unsupported/unknown or
+  disallowed command (``struct kvmi_error_code`` will still be sent, with
+  -KVM_EOPNOTSUPP or -KVM_EPERM).
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 9574ba0b9565..a1ab39c5b8e0 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -83,4 +83,11 @@ struct kvmi_get_version_reply {
__u32 padding;
 };
 
+struct kvmi_control_cmd_response {
+   __u8 enable;
+   __u8 now;
+   __u16 padding1;
+   __u32 padding2;
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 76119a4b69d8..157f765fb34d 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -85,6 +85,8 @@ struct kvmi {
 
DECLARE_BITMAP(cmd_allow_mask, KVMI_NUM_COMMANDS);
DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
+
+   bool cmd_reply_disabled;
 };
 
 /* kvmi_msg.c */
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 6fe04de29f7e..ea5c7e23669a 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -9,6 +9,7 @@
 #include "kvmi_int.h"
 
 static const char *const msg_IDs[] = {
+   [KVMI_CONTROL_CMD_RESPONSE]  = "KVMI_CONTROL_CMD_RESPONSE",
[KVMI_GET_VERSION]   = "KVMI_GET_VERSION",
 };
 
@@ -130,6 +131,36 @@ static int kvmi_msg_vm_reply(struct kvmi *ikvm,
return kvmi_msg_reply(ikvm, msg, err, rpl, rpl_size);
 }
 
+static bool kvmi_validate_no_reply(struct kvmi *ikvm,
+  const struct kvmi_msg_hdr *msg,
+  size_t rpl_size, int err)
+{
+   if (rpl_size) {
+   kvmi_err(ikvm, "Reply disabled for command %d", msg->id);
+   return false;
+   }
+
+   if (err)
+   kvmi_warn(ikvm, "Error code %d discarded for message id %d\n",
+ err, msg->id);
+
+   return true;
+}
+
+static int kvmi_msg_vm_maybe_reply(struct kvmi *ikvm,
+  const struct kvmi_msg_hdr *msg,
+  int err, const void *rpl,
+  size_t rpl_size)
+{
+   if (ikvm->cmd_reply_disabled) {
+   if (!kvmi_validate_no_reply(ikvm, msg, rpl_size, err))
+   return -KVM_EINVAL;
+   return 0;
+   }
+
+   return kvmi_msg_vm_reply(ikvm, msg, err, rpl, rpl_size);
+}
+
 static int handle_get_version(struct kvmi *ikvm,
 

[RFC PATCH v6 48/92] kvm: add kvm_vcpu_kick_and_wait()

2019-08-09 Thread Adalbert Lazăr
This function is needed for the KVMI_PAUSE_VCPU command. There are
cases when it is easier for the introspection tool if it knows that
the vCPU is no longer running guest code once the command completes,
without waiting for the KVMI_EVENT_PAUSE_VCPU event.
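
A hedged sketch of the intended call site (the actual KVMI_PAUSE_VCPU
handler arrives in a later patch; the request name below is only
illustrative):

	/* After this returns, the target vCPU is out of guest mode:
	 * either it was woken from its wait, or it was forced out via
	 * IPI and the sender waited for the ack (KVM_REQUEST_WAIT). */
	kvm_make_request(KVM_REQ_INTROSPECTION, vcpu);	/* hypothetical */
	kvm_vcpu_kick_and_wait(vcpu);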

Signed-off-by: Adalbert Lazăr 
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c  | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ae4106aae16e..09bc06747642 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -738,6 +738,7 @@ void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
 void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+void kvm_vcpu_kick_and_wait(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e11069b9565..5256d7321d0e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2370,6 +2370,16 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 #endif /* !CONFIG_S390 */
 
+void kvm_vcpu_kick_and_wait(struct kvm_vcpu *vcpu)
+{
+   if (kvm_vcpu_wake_up(vcpu))
+   return;
+
+   if (kvm_request_needs_ipi(vcpu, KVM_REQUEST_WAIT))
+   smp_call_function_single(vcpu->cpu, ack_flush, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick_and_wait);
+
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
struct pid *pid;

[RFC PATCH v6 53/92] kvm: introspection: add KVMI_INJECT_EXCEPTION + KVMI_EVENT_TRAP

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

The KVMI_INJECT_EXCEPTION command is used by the introspection tool to
inject exceptions (eg. to bring a page in from swap). The exception is
queued right before entering the guest. If there is already an event pending
(exception, interrupt or NMI) we notify the introspection tool with the
KVMI_EVENT_TRAP event and abort the injection. The introspection tool is
expected to try again at a later time.
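
For example (a sketch, with a hypothetical fault_gva), forcing a #PF so
that the guest kernel pages the address back in:

	struct kvmi_inject_exception req = {
		.nr = 14,		/* #PF */
		.has_error = 1,
		.error_code = 0x2,	/* write fault, illustrative value */
		.address = fault_gva,	/* guest virtual address */
	};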

CC: Joerg Roedel 
Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  71 +++
 arch/x86/include/uapi/asm/kvmi.h   |   8 +++
 arch/x86/kvm/kvmi.c| 108 +
 arch/x86/kvm/x86.c |  11 +++
 include/linux/kvmi.h   |   4 ++
 include/uapi/linux/kvmi.h  |   8 +++
 virt/kvm/kvmi.c|  40 +++
 virt/kvm/kvmi_int.h|  16 +
 virt/kvm/kvmi_msg.c|  21 ++
 9 files changed, 287 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 9e15132ed976..1eaed7c61148 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -969,6 +969,44 @@ Returns a CPUID leaf (as seen by the guest OS).
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOENT - the selected leaf is not present or is invalid
 
+20. KVMI_INJECT_EXCEPTION
+-------------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_inject_exception {
+   __u8 nr;
+   __u8 has_error;
+   __u16 padding;
+   __u32 error_code;
+   __u64 address;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Injects a vCPU exception, with or without an error code. In the case of a
+page fault exception, the guest virtual address has to be specified.
+
+The introspection tool should enable the *KVMI_EVENT_TRAP* event in
+order to be notified if the exception was not delivered.
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - the specified exception number is invalid
+* -KVM_EINVAL - the specified address is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+
 Events
 ======
 
@@ -1167,3 +1205,36 @@ cannot be disabled via *KVMI_CONTROL_EVENTS*.
 This event has a low priority. It will be sent after any other vCPU
 introspection event and when no vCPU introspection command is queued.
 
+5. KVMI_EVENT_TRAP
+------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_trap {
+   __u32 vector;
+   __u32 type;
+   __u32 error_code;
+   __u32 padding;
+   __u64 cr2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent if a previous *KVMI_INJECT_EXCEPTION* command has
+been overwritten by an interrupt picked up during guest reentry and the
+introspection has been enabled for this event (see *KVMI_CONTROL_EVENTS*).
+
+``kvmi_event``, exception/interrupt number (vector), exception/interrupt
+type, exception code (``error_code``) and CR2 are sent to the introspector.
+
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index fa2719226198..b074ad735e84 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -26,6 +26,14 @@ struct kvmi_event_arch {
} msrs;
 };
 
+struct kvmi_event_trap {
+   __u32 vector;
+   __u32 type;
+   __u32 error_code;
+   __u32 padding;
+   __u64 cr2;
+};
+
 struct kvmi_get_registers {
__u16 nmsrs;
__u16 padding1;
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 4615bbe9c0db..8c18030d12f4 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -6,6 +6,7 @@
  */
 #include "x86.h"
 #include "cpuid.h"
+#include 
 #include "../../../virt/kvm/kvmi_int.h"
 
 static void *alloc_get_registers_reply(const struct kvmi_msg_hdr *msg,
@@ -212,6 +213,87 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
return ret;
 }
 
+bool kvmi_arch_queue_exception(struct kvm_vcpu *vcpu)
+{
+   if (!vcpu->arch.exception.injected &&
+   !vcpu->arch.interrupt.injected &&
+   !vcpu->arch.nmi_injected) {
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   struct x86_exception e = {
+   .vector = ivcpu->exception.nr,
+   .error_code_valid = ivcpu->exception.error_code_valid,
+   .error_code = ivcpu->exception.error_code,
+   .address = ivcpu->exception.address,

[RFC PATCH v6 37/92] KVM: VMX: Introduce SPP access bitmap and operation functions

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

Create the access bitmap for SPP subpages: 4KB/128B = 32 bits, so for
each 4KB physical page, 32 bits are required. The bitmap can
be easily accessed with a gfn. The initial access bitmap for each
physical page is 0xFFFFFFFF, meaning SPP is not enabled for the
subpages.
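
A small sketch of the mapping this implies (bit i of a page's 32-bit
map covers bytes [i * 128, i * 128 + 127] of that page):

	/* Sketch: which bit of the per-page bitmap guards this gpa? */
	static inline u32 spp_subpage_bit(u64 gpa)
	{
		return 1u << ((gpa & ~PAGE_MASK) / 128);
	}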

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-5-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c  | 50 +
 arch/x86/kvm/x86.c  | 11 
 3 files changed, 62 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c05984f39923..f0878631b12a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -790,6 +790,7 @@ struct kvm_lpage_info {
 
 struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
+   u32 *subpage_wp_info;
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
 };
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8a6287cd2be4..f2774bbcfeed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1482,6 +1482,56 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
return sptep;
 }
 
+#define FULL_SPP_ACCESS ((u32)((1ULL << 32) - 1))
+
+static int kvm_subpage_create_bitmaps(struct kvm *kvm)
+{
+   struct kvm_memslots *slots;
+   struct kvm_memory_slot *memslot;
+   int i, j, ret;
+   u32 *buff;
+
+   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+   slots = __kvm_memslots(kvm, i);
+   kvm_for_each_memslot(memslot, slots) {
+   buff = kvzalloc(memslot->npages *
+   sizeof(*memslot->arch.subpage_wp_info),
+   GFP_KERNEL);
+
+   if (!buff) {
+ ret = -ENOMEM;
+ goto out_free;
+   }
+   memslot->arch.subpage_wp_info = buff;
+
+   for (j = 0; j < memslot->npages; j++)
+ buff[j] = FULL_SPP_ACCESS;
+   }
+   }
+
+   return 0;
+out_free:
+   for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+   slots = __kvm_memslots(kvm, i);
+   kvm_for_each_memslot(memslot, slots) {
+   if (memslot->arch.subpage_wp_info) {
+   kvfree(memslot->arch.subpage_wp_info);
+   memslot->arch.subpage_wp_info = NULL;
+   }
+   }
+   }
+
+   return ret;
+}
+
+static u32 *gfn_to_subpage_wp_info(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+   unsigned long idx;
+
+   idx = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+   return &slot->arch.subpage_wp_info[idx];
+}
+
 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)
\
for (_spte_ = rmap_get_first(_rmap_head_, _iter_);  \
 _spte_; _spte_ = rmap_get_next(_iter_))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ef6d9dd80086..2ac1e0aba1fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9320,6 +9320,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
 }
 
+void kvm_subpage_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+
+   if (!dont || free->arch.subpage_wp_info !=
+   dont->arch.subpage_wp_info) {
+   kvfree(free->arch.subpage_wp_info);
+   free->arch.subpage_wp_info = NULL;
+   }
+}
+
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont)
 {

[RFC PATCH v6 57/92] kvm: introspection: add KVMI_GET_XSAVE

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This vCPU command is used to get the XSAVE area.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 31 ++
 arch/x86/include/uapi/asm/kvmi.h   |  4 
 arch/x86/kvm/kvmi.c| 21 
 arch/x86/kvm/x86.c |  4 ++--
 include/linux/kvm_host.h   |  2 ++
 virt/kvm/kvmi_int.h|  3 +++
 virt/kvm/kvmi_msg.c| 17 
 7 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index c41c3edb0134..c43ea1b33a51 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1081,6 +1081,37 @@ to control events for any other register will fail with -KVM_EINVAL::
 * -KVM_EINVAL - padding is not zero
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 
+23. KVMI_GET_XSAVE
+------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_xsave_reply {
+   __u32 region[0];
+   };
+
+Returns a buffer containing the XSAVE area. Currently, the size of
+``kvm_xsave`` is used, but it could change. The userspace should derive
+the buffer size from the message size.
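
For instance (a sketch, assuming ``hdr`` is the already-read
``kvmi_msg_hdr`` of the reply and that its ``size`` covers the payload)::

	/* The XSAVE region is whatever follows the error code. */
	size_t region_size = hdr.size - sizeof(struct kvmi_error_code);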
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOMEM - not enough memory to allocate the reply
+
 Events
 ======
 
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index 08af2eccbdfb..a3fcb1ef8404 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -97,4 +97,8 @@ struct kvmi_event_msr_reply {
__u64 new_val;
 };
 
+struct kvmi_get_xsave_reply {
+   __u32 region[0];
+};
+
 #endif /* _UAPI_ASM_X86_KVMI_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index fc6956b50da2..078d714b59d5 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -790,3 +790,24 @@ int kvmi_arch_cmd_control_spp(struct kvmi *ikvm)
 {
return kvm_arch_init_spp(ikvm->kvm);
 }
+
+int kvmi_arch_cmd_get_xsave(struct kvm_vcpu *vcpu,
+   struct kvmi_get_xsave_reply **dest,
+   size_t *dest_size)
+{
+   struct kvmi_get_xsave_reply *rpl = NULL;
+   size_t rpl_size = sizeof(*rpl) + sizeof(struct kvm_xsave);
+   struct kvm_xsave *area;
+
+   rpl = kvmi_msg_alloc_check(rpl_size);
+   if (!rpl)
+   return -KVM_ENOMEM;
+
+   area = (struct kvm_xsave *) &rpl->region[0];
+   kvm_vcpu_ioctl_x86_get_xsave(vcpu, area);
+
+   *dest = rpl;
+   *dest_size = rpl_size;
+
+   return 0;
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac027471c4f3..05ff23180355 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3745,8 +3745,8 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
}
 }
 
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
-struct kvm_xsave *guest_xsave)
+void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
 {
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
memset(guest_xsave, 0, sizeof(struct kvm_xsave));
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8eb1a4d997f..3aad3b96107b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -805,6 +805,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
+void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave);
 
 int kvm_arch_init(void *opaque);
 void kvm_arch_exit(void);
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 640a78b54947..1a705cba4776 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -255,6 +255,9 @@ void kvmi_arch_trap_event(struct kvm_vcpu *vcpu);
 int kvmi_arch_cmd_get_cpuid(struct kvm_vcpu *vcpu,
const struct kvmi_get_cpuid *req,
struct kvmi_get_cpuid_reply *rpl);
+int kvmi_arch_cmd_get_xsave(struct kvm_vcpu *vcpu,
+   struct kvmi_get_xsave_reply **dest,
+   size_t *dest_size);
 int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu,
struct kvmi_get_vcpu_info_reply *rpl);
 int kvmi_arch_cmd_inject_exception(struct kvm_vcpu *vcpu, u8 vector,
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 8a8951f13f8e..6bc18b7973cf 100644
--- a

[RFC PATCH v6 72/92] kvm: introspection: add memory map/unmap support on the guest side

2019-08-09 Thread Adalbert Lazăr
From: Mircea Cîrjaliu 

An introspection tool running in a dedicated VM can use the new device
(/dev/kvmmem) to map memory from other introspected VM-s.

Two ioctl operations are supported:
  - KVM_HC_MEM_MAP/struct kvmi_mem_map
  - KVM_HC_MEM_UNMAP/unsigned long

In order to map an introspected gpa to the local gva, the process using
this device needs to obtain a token from the host KVMI subsystem (see
Documentation/virtual/kvm/kvmi.rst - KVMI_GET_MAP_TOKEN).

Both operations use hypercalls (KVM_HC_MEM_MAP, KVM_HC_MEM_UNMAP)
to pass the requests to the host kernel/KVMi (see hypercalls.txt).
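
A sketch of the in-guest flow built on the helpers declared by this
patch (the token must have been obtained beforehand via
KVMI_GET_MAP_TOKEN; error handling elided):

	struct kvmi_map_mem_token token;	/* filled in by the tool */
	long err;

	err = kvmi_arch_map_hc(&token, req_gpa, map_gpa);
	if (!err) {
		/* map_gpa now aliases req_gpa's physical page ... */
		kvmi_arch_unmap_hc(map_gpa);
	}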

Signed-off-by: Mircea Cîrjaliu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/hypercalls.txt |  34 ++
 arch/x86/Kconfig |   9 +
 arch/x86/include/asm/kvmi_guest.h|  10 +
 arch/x86/kernel/Makefile |   1 +
 arch/x86/kernel/kvmi_mem_guest.c |  26 +
 include/uapi/linux/kvm_para.h|   2 +
 include/uapi/linux/kvmi.h|  21 +
 virt/kvm/kvmi_mem_guest.c| 651 +++
 8 files changed, 754 insertions(+)
 create mode 100644 arch/x86/include/asm/kvmi_guest.h
 create mode 100644 arch/x86/kernel/kvmi_mem_guest.c
 create mode 100644 virt/kvm/kvmi_mem_guest.c

diff --git a/Documentation/virtual/kvm/hypercalls.txt 
b/Documentation/virtual/kvm/hypercalls.txt
index 1ab59537b2fb..a47fae926201 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -173,3 +173,37 @@ The following registers are clobbered:
 In particular, for KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT, the last two
 registers can be poisoned deliberately and cannot be used for passing
 information.
+
+9. KVM_HC_MEM_MAP
+-----------------
+
+Architecture: x86
+Status: active
+Purpose: Map a guest physical page to another VM (the introspector).
+Usage:
+
+a0: pointer to a token obtained with a KVMI_GET_MAP_TOKEN command (see kvmi.rst)
+   struct kvmi_map_mem_token {
+   __u64 token[4];
+   };
+
+a1: guest physical address to be mapped
+
+a2: guest physical address from introspector that will be replaced
+
+Both guest physical addresses will end up pointing to the same physical page.
+
+Returns any error that the memory manager can return.
+
+10. KVM_HC_MEM_UNMAP
+--------------------
+
+Architecture: x86
+Status: active
+Purpose: Unmap a previously mapped page.
+Usage:
+
+a0: guest physical address from introspector
+
+The address will stop pointing to the introspected page and a new physical
+page will be allocated for this gpa.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68261430fe6e..a7527c1f90a0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -820,6 +820,15 @@ config KVM_DEBUG_FS
  Statistics are displayed in debugfs filesystem. Enabling this option
  may incur significant overhead.
 
+config KVM_INTROSPECTION_GUEST
+   bool "KVM Memory Introspection support on Guest"
+   depends on KVM_GUEST
+   default n
+   help
+ This option enables functions and hypercalls for security applications
+ running in a separate VM to control the execution of other VM-s, query
+ the state of the vCPU-s (GPR-s, MSR-s etc.).
+
 config PARAVIRT_TIME_ACCOUNTING
bool "Paravirtual steal time accounting"
depends on PARAVIRT
diff --git a/arch/x86/include/asm/kvmi_guest.h 
b/arch/x86/include/asm/kvmi_guest.h
new file mode 100644
index ..c7ed53a938e0
--- /dev/null
+++ b/arch/x86/include/asm/kvmi_guest.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVMI_GUEST_H__
+#define __KVMI_GUEST_H__
+
+long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
+   gpa_t req_gpa, gpa_t map_gpa);
+long kvmi_arch_unmap_hc(gpa_t map_gpa);
+
+
+#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 00b7e27bc2b7..995652ba53b3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -116,6 +116,7 @@ obj-$(CONFIG_PARAVIRT)  += paravirt.o 
paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)   += pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+obj-$(CONFIG_KVM_INTROSPECTION_GUEST)  += kvmi_mem_guest.o 
../../../virt/kvm/kvmi_mem_guest.o
 
 obj-$(CONFIG_JAILHOUSE_GUEST)  += jailhouse.o
 
diff --git a/arch/x86/kernel/kvmi_mem_guest.c b/arch/x86/kernel/kvmi_mem_guest.c
new file mode 100644
index ..c4e2613f90f3
--- /dev/null
+++ b/arch/x86/kernel/kvmi_mem_guest.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM introspection guest implementation
+ *
+ * Copyright (C) 2017 Bitdefender S.R.L.
+ *
+ * Author:
+ *   Mircea Cirjaliu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+long kvmi_arch_map_hc(struct kvmi_map_mem_token *tknp,
+  gpa_t req_gpa, gpa_t map_gpa)
+{
+	return kvm_hypercall3(KVM_HC_MEM_MAP, (unsigned long)tknp,
+			      req_gpa, map_gpa);
+}

[RFC PATCH v6 52/92] kvm: introspection: add KVMI_GET_CPUID

2019-08-09 Thread Adalbert Lazăr
From: Marian Rotariu 

This command returns a CPUID leaf (as seen by the guest OS).

Signed-off-by: Marian Rotariu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 36 ++
 arch/x86/include/uapi/asm/kvmi.h   | 12 ++
 arch/x86/kvm/kvmi.c| 19 
 include/uapi/linux/kvm_para.h  |  1 +
 virt/kvm/kvmi_int.h|  3 +++
 virt/kvm/kvmi_msg.c| 16 +
 6 files changed, 87 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index b6722d071ab7..9e15132ed976 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -933,6 +933,42 @@ currently being handled is replied to.
 * -KVM_EINVAL - padding is not zero
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 
+19. KVMI_GET_CPUID
+------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_get_cpuid {
+   __u32 function;
+   __u32 index;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_cpuid_reply {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+   };
+
+Returns a CPUID leaf (as seen by the guest OS).
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOENT - the selected leaf is not present or is invalid
+
 Events
 ======
 
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index 98fb27e1273c..fa2719226198 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -41,4 +41,16 @@ struct kvmi_get_registers_reply {
struct kvm_msrs msrs;
 };
 
+struct kvmi_get_cpuid {
+   __u32 function;
+   __u32 index;
+};
+
+struct kvmi_get_cpuid_reply {
+   __u32 eax;
+   __u32 ebx;
+   __u32 ecx;
+   __u32 edx;
+};
+
 #endif /* _UAPI_ASM_X86_KVMI_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index a78771b21d2f..4615bbe9c0db 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2019 Bitdefender S.R.L.
  */
 #include "x86.h"
+#include "cpuid.h"
 #include "../../../virt/kvm/kvmi_int.h"
 
 static void *alloc_get_registers_reply(const struct kvmi_msg_hdr *msg,
@@ -211,6 +212,24 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
return ret;
 }
 
+int kvmi_arch_cmd_get_cpuid(struct kvm_vcpu *vcpu,
+   const struct kvmi_get_cpuid *req,
+   struct kvmi_get_cpuid_reply *rpl)
+{
+   struct kvm_cpuid_entry2 *e;
+
+   e = kvm_find_cpuid_entry(vcpu, req->function, req->index);
+   if (!e)
+   return -KVM_ENOENT;
+
+   rpl->eax = e->eax;
+   rpl->ebx = e->ebx;
+   rpl->ecx = e->ecx;
+   rpl->edx = e->edx;
+
+   return 0;
+}
+
 int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu,
struct kvmi_get_vcpu_info_reply *rpl)
 {
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 07e3f2662b36..553f168331a4 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -19,6 +19,7 @@
 #define KVM_EOPNOTSUPP 95
 #define KVM_EAGAIN 11
 #define KVM_EBUSY  EBUSY
+#define KVM_ENOENT ENOENT
 #define KVM_ENOMEM ENOMEM
 
 #define KVM_HC_VAPIC_POLL_IRQ  1
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 7bc3dd1f2298..22508d147495 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -230,6 +230,9 @@ int kvmi_arch_cmd_set_page_write_bitmap(struct kvmi *ikvm,
 void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct kvmi_event *ev);
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
u8 access);
+int kvmi_arch_cmd_get_cpuid(struct kvm_vcpu *vcpu,
+   const struct kvmi_get_cpuid *req,
+   struct kvmi_get_cpuid_reply *rpl);
 int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu,
struct kvmi_get_vcpu_info_reply *rpl);
 
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 355cec70a28d..9548042de618 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -29,6 +29,7 @@ static const char *const msg_IDs[] = {
[KVMI_CONTROL_VM_EVENTS] = "KVMI_CONTROL_VM_EVENTS",
[KVMI_EVENT] = "KVMI_EVENT",
[KVMI_EVENT_REPLY]   = "KVMI_EVENT_REPLY",
+   [KVMI_GET_CPUID] = "KVMI_GET_CPUID",
[KVMI_GET_GUEST_INFO]= "KVMI_GET_GUEST_INFO",
[KVMI_GE

[RFC PATCH v6 92/92] kvm: x86: fallback to the single-step on multipage CMPXCHG emulation

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

There are cases where we need to emulate a CMPXCHG that touches two
pages (4 bytes in one and another 4 in the next, for example). Because
it is not easy to map two pages in the kernel so that we can directly
execute the exchange instruction, we fall back to single-stepping.
Luckily, this is an uncommon occurrence, making the overhead of the
single-step mechanism acceptable.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e904782d303..e283b074db26 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5671,6 +5671,12 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
 #define CMPXCHG_MAX_BYTES 8
 #endif
 
+   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+   if (gpa == UNMAPPED_GVA ||
+   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+   goto emul_write;
+
/* guests cmpxchg{8,16}b have to be emulated atomically */
if (bytes > CMPXCHG_MAX_BYTES || (bytes & (bytes - 1)))
goto emul_write;
@@ -5678,12 +5684,6 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
if (bytes == 16 && !system_has_cmpxchg_double())
goto emul_write;
 
-   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
-
-   if (gpa == UNMAPPED_GVA ||
-   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-   goto emul_write;
-
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
 
@@ -5772,6 +5772,9 @@ static int emulator_cmpxchg_emulated(struct 
x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
 
 emul_write:
+   if (kvmi_tracked_gfn(vcpu, gpa >> PAGE_SHIFT))
+   return X86EMUL_UNHANDLEABLE;
+
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
return emulator_write_emulated(ctxt, addr, new, bytes, exception);

[RFC PATCH v6 64/92] kvm: introspection: add single-stepping

2019-08-09 Thread Adalbert Lazăr
From: Nicușor Cîțu 

This would be used either if the introspection tool requests it as a
reply to a KVMI_EVENT_PF event or to cope with instructions that cannot
be handled by the x86 emulator during the handling of a VMEXIT. In
these situations, all other vCPU-s are kicked and held, the EPT-based
protection is removed and the guest is single-stepped by the vCPU that
triggered the initial VMEXIT. Upon completion, the EPT-based protection
is reinstalled and all vCPU-s are allowed to return to the guest.

This is a rather slow workaround that kicks in occasionally. In the
future, the most frequently single-stepped instructions should be added
to the emulator (usually, stores to and from memory - SSE/AVX).

For the moment it works only on Intel.

CC: Jim Mattson 
CC: Sean Christopherson 
CC: Joerg Roedel 
Signed-off-by: Nicușor Cîțu 
Co-developed-by: Mihai Donțu 
Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/kvmi.c |  47 ++-
 arch/x86/kvm/svm.c  |   5 ++
 arch/x86/kvm/vmx/vmx.c  |  17 
 arch/x86/kvm/x86.c  |  19 +
 include/linux/kvmi.h|   4 +
 virt/kvm/kvmi.c | 145 +++-
 virt/kvm/kvmi_int.h |  16 
 8 files changed, 253 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ad36a5fc2048..60e2c298d469 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1016,6 +1016,7 @@ struct kvm_x86_ops {
void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
bool enable);
bool (*desc_intercept)(struct kvm_vcpu *vcpu, bool enable);
+   void (*set_mtf)(struct kvm_vcpu *vcpu, bool enable);
void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
bool (*spt_fault)(struct kvm_vcpu *vcpu);
@@ -1628,6 +1629,8 @@ void kvm_arch_msr_intercept(struct kvm_vcpu *vcpu, 
unsigned int msr,
bool enable);
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
 bool kvm_spt_fault(struct kvm_vcpu *vcpu);
+void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable);
+void kvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
 void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 04cac5b8a4d0..f0ab4bd9eb37 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -520,7 +520,6 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
u32 ctx_size;
u64 ctx_addr;
u32 action;
-   bool singlestep_ignored;
bool ret = false;
 
if (!kvm_spt_fault(vcpu))
@@ -533,7 +532,7 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
if (ivcpu->effective_rep_complete)
return true;
 
-	action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &singlestep_ignored,
+	action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &ivcpu->ss_requested,
+				  &ivcpu->rep_complete, &ctx_addr,
+				  ivcpu->ctx_data, &ctx_size);
 
@@ -547,6 +546,8 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
ret = true;
break;
case KVMI_EVENT_ACTION_RETRY:
+   if (ivcpu->ss_requested && !kvmi_start_ss(vcpu, gpa, access))
+   ret = true;
break;
default:
kvmi_handle_common_event_actions(vcpu, action, "PF");
@@ -758,6 +759,48 @@ int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
return 0;
 }
 
+void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu)
+{
+   kvm_set_mtf(vcpu, true);
+
+   /*
+* Set blocking by STI only if RFLAGS.IF = 1.
+* Blocking by both STI and MOV/POP SS is not possible.
+*/
+   if (kvm_arch_interrupt_allowed(vcpu))
+   kvm_set_interrupt_shadow(vcpu, KVM_X86_SHADOW_INT_STI);
+
+}
+
+void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu)
+{
+   kvm_set_mtf(vcpu, false);
+   /*
+* The blocking by STI is cleared after the guest
+* executes one instruction or incurs an exception.
+* However we migh stop the SS before entering to guest,
+* so be sure we are clearing the STI blocking.
+*/
+   kvm_set_interrupt_shadow(vcpu, 0);
+}
+
+u8 kvmi_arch_relax_page_access(u8 old, u8 new)
+{
+   u8 ret = old | new;
+
+   /*
+* An SPTE entry with just the -wx bits set can trigger a
+* misconfiguration error from the hardware, as it's the case
+* for x86 where this access mode is used to mark I/O memory.
+* Thus, we make sure that -

[RFC PATCH v6 76/92] kvm: x86: disable EPT A/D bits if introspection is present

2019-08-09 Thread Adalbert Lazăr
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dc648ba47df3..152c58b63f69 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7718,7 +7718,7 @@ static __init int hardware_setup(void)
!cpu_has_vmx_invept_global())
enable_ept = 0;
 
-   if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
+   if (!cpu_has_vmx_ept_ad_bits() || !enable_ept || kvmi_is_present())
enable_ept_ad_bits = 0;
 
if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)

[RFC PATCH v6 77/92] kvm: introspection: add trace functions

2019-08-09 Thread Adalbert Lazăr
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Co-developed-by: Mircea Cîrjaliu 
Signed-off-by: Mircea Cîrjaliu 
Co-developed-by: Marian Rotariu 
Signed-off-by: Marian Rotariu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/kvmi.c |  63 
 include/trace/events/kvmi.h | 680 
 virt/kvm/kvmi.c |  20 ++
 virt/kvm/kvmi_mem.c |   5 +
 virt/kvm/kvmi_msg.c |  16 +
 5 files changed, 784 insertions(+)
 create mode 100644 include/trace/events/kvmi.h

diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 5312f179af9c..171e76449271 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -9,6 +9,8 @@
 #include 
 #include "../../../virt/kvm/kvmi_int.h"
 
+#include <trace/events/kvmi.h>
+
 static unsigned long *msr_mask(struct kvm_vcpu *vcpu, unsigned int *msr)
 {
switch (*msr) {
@@ -102,6 +104,9 @@ static bool __kvmi_msr_event(struct kvm_vcpu *vcpu, struct 
msr_data *msr)
if (old_msr.data == msr->data)
return true;
 
+   trace_kvmi_event_msr_send(vcpu->vcpu_id, msr->index, old_msr.data,
+ msr->data);
+
action = kvmi_send_msr(vcpu, msr->index, old_msr.data, msr->data,
+			       &ret_value);
switch (action) {
@@ -113,6 +118,8 @@ static bool __kvmi_msr_event(struct kvm_vcpu *vcpu, struct 
msr_data *msr)
kvmi_handle_common_event_actions(vcpu, action, "MSR");
}
 
+   trace_kvmi_event_msr_recv(vcpu->vcpu_id, action, ret_value);
+
return ret;
 }
 
@@ -387,6 +394,8 @@ static bool __kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned 
int cr,
if (!test_bit(cr, IVCPU(vcpu)->cr_mask))
return true;
 
+   trace_kvmi_event_cr_send(vcpu->vcpu_id, cr, old_value, *new_value);
+
+   action = kvmi_send_cr(vcpu, cr, old_value, *new_value, &ret_value);
switch (action) {
case KVMI_EVENT_ACTION_CONTINUE:
@@ -397,6 +406,8 @@ static bool __kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned 
int cr,
kvmi_handle_common_event_actions(vcpu, action, "CR");
}
 
+   trace_kvmi_event_cr_recv(vcpu->vcpu_id, action, ret_value);
+
return ret;
 }
 
@@ -437,6 +448,8 @@ static void __kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
 {
u32 action;
 
+   trace_kvmi_event_xsetbv_send(vcpu->vcpu_id);
+
action = kvmi_send_xsetbv(vcpu);
switch (action) {
case KVMI_EVENT_ACTION_CONTINUE:
@@ -444,6 +457,8 @@ static void __kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
default:
kvmi_handle_common_event_actions(vcpu, action, "XSETBV");
}
+
+   trace_kvmi_event_xsetbv_recv(vcpu->vcpu_id, action);
 }
 
 void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
@@ -460,12 +475,26 @@ void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
kvmi_put(vcpu->kvm);
 }
 
+static u64 get_next_rip(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   if (ivcpu->have_delayed_regs)
+   return ivcpu->delayed_regs.rip;
+   else
+   return kvm_rip_read(vcpu);
+}
+
 void kvmi_arch_breakpoint_event(struct kvm_vcpu *vcpu, u64 gva, u8 insn_len)
 {
u32 action;
u64 gpa;
+   u64 old_rip;
 
gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, 0, NULL);
+   old_rip = kvm_rip_read(vcpu);
+
+   trace_kvmi_event_bp_send(vcpu->vcpu_id, gpa, old_rip);
 
action = kvmi_msg_send_bp(vcpu, gpa, insn_len);
switch (action) {
@@ -478,6 +507,8 @@ void kvmi_arch_breakpoint_event(struct kvm_vcpu *vcpu, u64 
gva, u8 insn_len)
default:
kvmi_handle_common_event_actions(vcpu, action, "BP");
}
+
+   trace_kvmi_event_bp_recv(vcpu->vcpu_id, action, get_next_rip(vcpu));
 }
 
 #define KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT 24
@@ -504,6 +535,8 @@ void kvmi_arch_hypercall_event(struct kvm_vcpu *vcpu)
 {
u32 action;
 
+   trace_kvmi_event_hc_send(vcpu->vcpu_id);
+
action = kvmi_msg_send_hypercall(vcpu);
switch (action) {
case KVMI_EVENT_ACTION_CONTINUE:
@@ -511,6 +544,8 @@ void kvmi_arch_hypercall_event(struct kvm_vcpu *vcpu)
default:
kvmi_handle_common_event_actions(vcpu, action, "HYPERCALL");
}
+
+   trace_kvmi_event_hc_recv(vcpu->vcpu_id, action);
 }
 
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
@@ -532,6 +567,9 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, 
gva_t gva,
if (ivcpu->effective_rep_complete)
return true;
 
+   trace_kvmi_event_pf_send(vcpu->vcpu_id, gpa, gva, access,
+kvm_rip_read(vcpu));
+
	action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &ivcpu->ss_requested,
				  &ivcpu->rep_complete, &ctx_addr,

[RFC PATCH v6 90/92] kvm: x86: emulate lock cmpxchg8b atomically

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

As was the case for lock cmpxchg, lock cmpxchg8b was emulated in two
steps: the first one setting/clearing the zero flag and the last one
making the actual atomic operation.

This patch fixes that by combining the two, i.e. the writeback step is
no longer necessary as the first step made the changes directly in
memory.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 42 +-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index dac4c0ca1ee3..2038e42c1eae 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2320,7 +2320,47 @@ static int em_call_near_abs(struct x86_emulate_ctxt 
*ctxt)
 
 static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
-   u64 old = ctxt->dst.orig_val64;
+   u64 old;
+
+   if (ctxt->lock_prefix) {
+   int rc;
+   ulong linear;
+   u64 new = (reg_read(ctxt, VCPU_REGS_RBX) & (u32)-1) |
+   ((reg_read(ctxt, VCPU_REGS_RCX) & (u32)-1) << 32);
+
+   old = (reg_read(ctxt, VCPU_REGS_RAX) & (u32)-1) |
+   ((reg_read(ctxt, VCPU_REGS_RDX) & (u32)-1) << 32);
+
+   /* disable writeback altogether */
+   ctxt->d &= ~SrcWrite;
+   ctxt->d |= NoWrite;
+
+		rc = linearize(ctxt, ctxt->dst.addr.mem, 8, true, &linear);
+   if (rc != X86EMUL_CONTINUE)
+   return rc;
+
+		rc = ctxt->ops->cmpxchg_emulated(ctxt, linear, &old, &new,
+						 ctxt->dst.bytes,
+						 &ctxt->exception);
+
+   switch (rc) {
+   case X86EMUL_CONTINUE:
+   ctxt->eflags |= X86_EFLAGS_ZF;
+   break;
+   case X86EMUL_CMPXCHG_FAILED:
+   *reg_write(ctxt, VCPU_REGS_RAX) = old & (u32)-1;
+   *reg_write(ctxt, VCPU_REGS_RDX) = (old >> 32) & (u32)-1;
+
+   ctxt->eflags &= ~X86_EFLAGS_ZF;
+
+   rc = X86EMUL_CONTINUE;
+   break;
+   }
+
+   return rc;
+   }
+
+   old = ctxt->dst.orig_val64;
 
if (ctxt->dst.bytes == 16)
return X86EMUL_UNHANDLEABLE;

[RFC PATCH v6 01/92] kvm: introduce KVMI (VM introspection subsystem)

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

Besides the pointer to the new structure, the patch adds to the kvm
structure a reference counter (the new object will be used by the thread
receiving introspection commands/events) and a completion variable
(to signal that the VM can be hooked by the introspection tool).

Signed-off-by: Mihai Donțu 
Co-developed-by: Mircea Cîrjaliu 
Signed-off-by: Mircea Cîrjaliu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 75 ++
 arch/x86/kvm/Kconfig   |  7 +++
 arch/x86/kvm/Makefile  |  1 +
 include/linux/kvm_host.h   |  4 ++
 include/linux/kvmi.h   | 23 +
 include/uapi/linux/kvmi.h  | 68 +++
 virt/kvm/kvm_main.c| 10 +++-
 virt/kvm/kvmi.c| 64 +
 virt/kvm/kvmi_int.h| 12 +
 9 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/virtual/kvm/kvmi.rst
 create mode 100644 include/linux/kvmi.h
 create mode 100644 include/uapi/linux/kvmi.h
 create mode 100644 virt/kvm/kvmi.c
 create mode 100644 virt/kvm/kvmi_int.h

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
new file mode 100644
index ..d54caf8d974f
--- /dev/null
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -0,0 +1,75 @@
+==========================================================
+KVMI - The kernel virtual machine introspection subsystem
+==========================================================
+
+The KVM introspection subsystem provides a facility for applications running
+on the host or in a separate VM, to control the execution of other VM-s
+(pause, resume, shutdown), query the state of the vCPUs (GPRs, MSRs etc.),
+alter the page access bits in the shadow page tables (only for the hardware
+backed ones, eg. Intel's EPT) and receive notifications when events of
+interest have taken place (shadow page table level faults, key MSR writes,
+hypercalls etc.). Some notifications can be responded to with an action
+(like preventing an MSR from being written), others are merely informative
+(like breakpoint events which can be used for execution tracing).
+With few exceptions, all events are optional. An application using this
+subsystem will explicitly register for them.
+
+The use case that led to the creation of this subsystem is monitoring
+the guest OS, and as such the ABI/API is highly influenced by how the guest
+software (kernel, applications) sees the world. For example, some events
+provide information specific to the host CPU architecture
+(eg. MSR_IA32_SYSENTER_EIP) merely because it's leveraged by guest software
+to implement a critical feature (fast system calls).
+
+At the moment, the target audience for KVMI are security software authors
+that wish to perform forensics on newly discovered threats (exploits) or
+to implement another layer of security like preventing a large set of
+kernel rootkits simply by "locking" the kernel image in the shadow page
+tables (ie. enforce .text r-x, .rodata rw- etc.). It's the latter case that
+made KVMI a separate subsystem, even though many of these features are
+available in the device manager (eg. QEMU). The ability to build a security
+application that does not interfere (in terms of performance) with the
+guest software calls for a specialized interface that is designed for
+minimum overhead.
+
+API/ABI
+=======
+
+This chapter describes the VMI interface used to monitor and control local
+guests from a user application.
+
+Overview
+--------
+
+The interface is socket based, one connection for every VM. One end is in the
+host kernel while the other is held by the user application (introspection
+tool).
+
+The initial connection is established by an application running on the host
+(eg. QEMU) that connects to the introspection tool and after a handshake the
+socket is passed to the host kernel making all further communication take
+place between it and the introspection tool. The initiating party (QEMU) can
+close its end so that any potential exploits cannot take a hold of it.
+
+The socket protocol allows for commands and events to be multiplexed over
+the same connection. As such, it is possible for the introspection tool to
+receive an event while waiting for the result of a command. Also, it can
+send a command while the host kernel is waiting for a reply to an event.
+
+The kernel side of the socket communication is blocking and will wait for
+an answer from its peer indefinitely or until the guest is powered off
+(killed), restarted or the peer goes away, at which point it will wake
+up and properly cleanup as if the introspection subsystem has never been
+used on that guest. Obviously, whether the guest can really continue
+normal execution depends on whether the introspection tool has made any
+modifications that require an active KVMI channel.
+
+Memory acc

[RFC PATCH v6 05/92] kvm: introspection: add KVMI_GET_VERSION

2019-08-09 Thread Adalbert Lazăr
This command should be used by the introspection tool to identify the
commands/events supported by the KVMi subsystem and, most importantly,
what messages must be used for event replies. The kernel side will accept
smaller or bigger command messages, but it can be more strict with bigger
event reply messages.

The command is always allowed and any attempt from userspace to disallow it
through KVM_INTROSPECTION_COMMAND will get -EPERM (unless userspace chooses
to disable all commands, using id=-1, in which case KVMI_GET_VERSION is
quietly allowed, without an error).
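
In terms of the KVM_INTROSPECTION_COMMAND ioctl (field names assumed
from context, for illustration only):

	cmd.id = KVMI_GET_VERSION;
	cmd.allow = 0;
	ioctl(vm_fd, KVM_INTROSPECTION_COMMAND, &cmd);	/* -EPERM */

	cmd.id = -1;	/* disable all commands at once */
	cmd.allow = 0;
	ioctl(vm_fd, KVM_INTROSPECTION_COMMAND, &cmd);	/* 0, GET_VERSION kept */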

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 28 
 include/uapi/linux/kvmi.h  |  5 +
 virt/kvm/kvmi.c| 14 ++
 virt/kvm/kvmi_msg.c| 13 +
 4 files changed, 60 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 1d4a1dcd7d2f..0f296e3c4244 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -224,3 +224,31 @@ device-specific memory (DMA, emulated MMIO, reserved by a 
passthrough
 device etc.). It is up to the user to determine, using the guest operating
 system data structures, the areas that are safe to access (code, stack, heap
 etc.).
+
+Commands
+--------
+
+The following C structures are meant to be used directly when communicating
+over the wire. The peer that detects any size mismatch should simply close
+the connection and report the error.
+
+1. KVMI_GET_VERSION
+-------------------
+
+:Architectures: all
+:Versions: >= 1
+:Parameters: none
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_version_reply {
+   __u32 version;
+   __u32 padding;
+   };
+
+Returns the introspection API version.
+
+This command is always allowed and successful (if the introspection is
+built in kernel).
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 6c7600ed4564..9574ba0b9565 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -78,4 +78,9 @@ struct kvmi_error_code {
__u32 padding;
 };
 
+struct kvmi_get_version_reply {
+   __u32 version;
+   __u32 padding;
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index afa31748d7f4..d5b6af21564e 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -68,6 +68,8 @@ static bool alloc_kvmi(struct kvm *kvm, const struct 
kvm_introspection *qemu)
if (!ikvm)
return false;
 
+   set_bit(KVMI_GET_VERSION, ikvm->cmd_allow_mask);
+
	memcpy(&ikvm->uuid, &qemu->uuid, sizeof(ikvm->uuid));
 
ikvm->kvm = kvm;
@@ -290,6 +292,18 @@ int kvmi_ioctl_command(struct kvm *kvm, void __user *argp)
bitmap_from_u64(known, KVMI_KNOWN_COMMANDS);
bitmap_and(requested, requested, known, KVMI_NUM_COMMANDS);
 
+   if (!allow) {
+   DECLARE_BITMAP(always_allowed, KVMI_NUM_COMMANDS);
+
+   if (id == KVMI_GET_VERSION)
+   return -EPERM;
+
+   /* the on-stack bitmap starts out with random bits */
+   bitmap_zero(always_allowed, KVMI_NUM_COMMANDS);
+   set_bit(KVMI_GET_VERSION, always_allowed);
+
+   bitmap_andnot(requested, requested, always_allowed,
+ KVMI_NUM_COMMANDS);
+   }
+
return kvmi_ioctl_feature(kvm, allow, requested,
  offsetof(struct kvmi, cmd_allow_mask),
  KVMI_NUM_COMMANDS);
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index af6bc47dc031..6fe04de29f7e 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -9,6 +9,7 @@
 #include "kvmi_int.h"
 
 static const char *const msg_IDs[] = {
+   [KVMI_GET_VERSION]   = "KVMI_GET_VERSION",
 };
 
 static bool is_known_message(u16 id)
@@ -129,6 +130,17 @@ static int kvmi_msg_vm_reply(struct kvmi *ikvm,
return kvmi_msg_reply(ikvm, msg, err, rpl, rpl_size);
 }
 
+static int handle_get_version(struct kvmi *ikvm,
+ const struct kvmi_msg_hdr *msg, const void *req)
+{
+   struct kvmi_get_version_reply rpl;
+
+   memset(&rpl, 0, sizeof(rpl));
+   rpl.version = KVMI_VERSION;
+
+   return kvmi_msg_vm_reply(ikvm, msg, 0, &rpl, sizeof(rpl));
+}
+
 static bool is_command_allowed(struct kvmi *ikvm, int id)
 {
return test_bit(id, ikvm->cmd_allow_mask);
@@ -139,6 +151,7 @@ static bool is_command_allowed(struct kvmi *ikvm, int id)
  */
 static int(*const msg_vm[])(struct kvmi *, const struct kvmi_msg_hdr *,
const void *) = {
+   [KVMI_GET_VERSION]   = handle_get_version,
 };
 
 static bool is_vm_message(u16 id)

[RFC PATCH v6 45/92] kvm: introspection: add KVMI_GET_PAGE_WRITE_BITMAP

2019-08-09 Thread Adalbert Lazăr
This command returns subpage protection (SPP) write bitmaps for an array
of guest physical addresses of 4KB size.

Like the KVMI_GET_PAGE_ACCESS command, it checks only the radix tree,
not the SPP tables.  So, either we change it to check the SPP tables
or we drop it. Given the fact that the KVMI_EVENT_PF events are filtered
using the radix tree and that the introspection tool should know what
it tracks, we should choose the latter.
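
Sizing sketch for this command (grounded in the structures added below):
a request for n gpas appends n addresses, and a successful reply carries
n bitmaps in the same order:

	size_t req_size = sizeof(struct kvmi_get_page_write_bitmap)
			+ n * sizeof(__u64);
	size_t rpl_size = sizeof(struct kvmi_error_code)
			+ n * sizeof(__u32);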

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 44 ++
 arch/x86/kvm/kvmi.c| 44 ++
 include/uapi/linux/kvmi.h  | 11 
 virt/kvm/kvmi.c| 11 
 virt/kvm/kvmi_int.h| 11 
 virt/kvm/kvmi_msg.c| 18 
 6 files changed, 139 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index c1d12aaa8633..2ffb92b0fa71 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -650,6 +650,50 @@ If SPP is not enabled, *KVMI_GET_PAGE_WRITE_BITMAP* and
 * -KVM_EOPNOTSUPP - the hardware doesn't support SPP
 * -KVM_EOPNOTSUPP - the current implementation can't disable SPP
 
+12. KVMI_GET_PAGE_WRITE_BITMAP
+------------------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_get_page_write_bitmap {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   __u64 gpa[0];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_page_write_bitmap_reply {
+   __u32 bitmap[0];
+   };
+
+Returns subpage protection (SPP) write bitmaps for an array of ``count``
+guest physical addresses of 4KB size.
+
+By default, for any guest physical address, the returned bits will be zero
+(no write access for any subpage if the *KVMI_PAGE_ACCESS_W* flag has been
+cleared for the whole 4KB page - see *KVMI_SET_PAGE_ACCESS*).
+
+On Intel hardware with multiple EPT views, the ``view`` argument selects the
+EPT view (0 is primary). On all other hardware it must be zero.
+
+:Errors:
+
+* -KVM_EINVAL - the selected SPT view is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EOPNOTSUPP - a SPT view was selected but the hardware doesn't support it
+* -KVM_EOPNOTSUPP - the hardware doesn't support SPP or hasn't been enabled
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOMEM - not enough memory to allocate the reply
+
 Events
 ======
 
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 01fd218e213c..356ec79936b3 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -224,6 +224,50 @@ int kvmi_arch_cmd_get_page_access(struct kvmi *ikvm,
return 0;
 }
 
+int kvmi_arch_cmd_get_page_write_bitmap(struct kvmi *ikvm,
+   const struct kvmi_msg_hdr *msg,
+   const struct kvmi_get_page_write_bitmap
+   *req,
+   struct kvmi_get_page_write_bitmap_reply
+   **dest, size_t *dest_size)
+{
+   struct kvmi_get_page_write_bitmap_reply *rpl = NULL;
+   size_t rpl_size = 0;
+   u16 k, n = req->count;
+   int ec = 0;
+
+   if (req->padding)
+   return -KVM_EINVAL;
+
+   if (msg->size < sizeof(*req) + req->count * sizeof(req->gpa[0]))
+   return -KVM_EINVAL;
+
+   if (!kvmi_spp_enabled(ikvm))
+   return -KVM_EOPNOTSUPP;
+
+   if (req->view != 0) /* TODO */
+   return -KVM_EOPNOTSUPP;
+
+   rpl_size = sizeof(*rpl) + sizeof(rpl->bitmap[0]) * n;
+   rpl = kvmi_msg_alloc_check(rpl_size);
+   if (!rpl)
+   return -KVM_ENOMEM;
+
+   for (k = 0; k < n && ec == 0; k++)
+   ec = kvmi_cmd_get_page_write_bitmap(ikvm, req->gpa[k],
+					    &rpl->bitmap[k]);
+
+   if (ec) {
+   kvmi_msg_free(rpl);
+   return ec;
+   }
+
+   *dest = rpl;
+   *dest_size = rpl_size;
+
+   return 0;
+}
+
 int kvmi_arch_cmd_set_page_access(struct kvmi *ikvm,
  const struct kvmi_msg_hdr *msg,
  const struct kvmi_set_page_access *req)
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 9f2b13718e47..19a6a50df96b 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -149,6 +149,17 @@ struct kvmi_control_spp {
__u32 padding3;
 };
 
+struct kvmi_get_page_write_bitmap {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   __u64 gpa[0];
+};
+
+struct kvmi_get_page_write_bitmap_reply {
+   __u32 bitmap[0];
+};
+
 struct kvmi_get_vcpu_info_reply {
__u64 tsc_speed;
 };
diff --g

[RFC PATCH v6 62/92] kvm: introspection: add KVMI_EVENT_HYPERCALL

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This event is sent on a specific user hypercall.

It is used by the code residing inside the introspected guest to call the
introspection tool and to report certain details about its operation. For
example, a classic antimalware remediation tool can report what it has
found during a scan.

Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/hypercalls.txt | 34 +++-
 Documentation/virtual/kvm/kvmi.rst   | 31 +
 arch/x86/kvm/kvmi.c  | 33 +++
 arch/x86/kvm/x86.c   | 16 ---
 include/linux/kvmi.h |  2 ++
 include/uapi/linux/kvm_para.h|  2 ++
 virt/kvm/kvmi.c  | 22 +++
 virt/kvm/kvmi_int.h  |  3 +++
 virt/kvm/kvmi_msg.c  | 12 +
 9 files changed, 151 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/hypercalls.txt 
b/Documentation/virtual/kvm/hypercalls.txt
index da24c138c8d1..1ab59537b2fb 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -122,7 +122,7 @@ compute the CLOCK_REALTIME for its clock, at the same instant.
 Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
 or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
 
-6. KVM_HC_SEND_IPI
+7. KVM_HC_SEND_IPI
 
 Architecture: x86
 Status: active
@@ -141,3 +141,35 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
 corresponds to the APIC ID a2+1, and so on.
 
 Returns the number of CPUs to which the IPIs were delivered successfully.
+
+8. KVM_HC_XEN_HVM_OP
+--------------------
+
+Architecture: x86
+Status: active
+Purpose: To enable communication between a guest agent and a VMI application
+Usage:
+
+An event will be sent to the VMI application (see kvmi.rst) if the following
+registers, which differ between 32bit and 64bit, have the following values:
+
+   32bit       64bit       value
+   --------    --------    -----
+   ebx (a0)    rdi         KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT
+   ecx (a1)    rsi         0
+
+This specification copies Xen's { __HYPERVISOR_hvm_op,
+HVMOP_guest_request_vm_event } hypercall and can originate from kernel or
+userspace.
+
+It returns 0 if successful, or a negative POSIX.1 error code if it fails. The
+absence of an active VMI application is not signaled in any way.
+
+The following registers are clobbered:
+
+  * 32bit: edx, esi, edi, ebp
+  * 64bit: rdx, r10, r8, r9
+
+In particular, for KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT, the last two
+registers can be poisoned deliberately and cannot be used for passing
+information.
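
A sketch of the guest-agent side of this hypercall, using the generic
kvm_hypercall2() helper to load a0/a1 as listed in the table above:

	long err = kvm_hypercall2(KVM_HC_XEN_HVM_OP,
				  KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT,
				  0);
	/* err < 0: failure; 0: delivered (or no VMI application present) */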
diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index da216415bf32..2603813d1ee6 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1505,3 +1505,34 @@ trying to perform a certain operation (like creating a process).
 ``kvmi_event`` and the guest physical address are sent to the introspector.
 
 The *RETRY* action is used by the introspector for its own breakpoints.
+
+10. KVMI_EVENT_HYPERCALL
+------------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent on a specific user hypercall when the introspection has
+been enabled for this event (see *KVMI_CONTROL_EVENTS*).
+
+The hypercall number must be ``KVM_HC_XEN_HVM_OP`` with the
+``KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT`` sub-function
+(see hypercalls.txt).
+
+It is used by the code residing inside the introspected guest to call the
+introspection tool and to report certain details about its operation. For
+example, a classic antimalware remediation tool can report what it has
+found during a scan.
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index e998223bca1e..02e026ef5ed7 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -448,6 +448,39 @@ void kvmi_arch_breakpoint_event(struct kvm_vcpu *vcpu, u64 
gva, u8 insn_len)
}
 }
 
+#define KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT 24
+bool kvmi_arch_is_agent_hypercall(struct kvm_vcpu *vcpu)
+{
+   unsigned long subfunc1, subfunc2;
+   bool longmode = is_64_bit_mode(vcpu);
+
+   if (longmode) {
+   subfunc1 = kvm_register_read(vcpu, VCPU_REGS_RDI);
+   subfunc2 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+   } else {
+   subfunc1 = kvm_register_read(vcpu, VCPU_REGS_RBX);
   subfunc1 &= 0xffffffff;
   subfunc2 = kvm_register_read(vcpu, VCPU_REGS_RCX);
   subfunc2 &= 0xffffffff;
+   }
+
   return (subfunc1 == KVM_HC_XEN_HVM_OP_GUEST_REQUEST_VM_EVENT &&
   subfunc2 == 0);
}

[RFC PATCH v6 49/92] kvm: introspection: add KVMI_PAUSE_VCPU and KVMI_EVENT_PAUSE_VCPU

2019-08-09 Thread Adalbert Lazăr
This is the only vCPU command handled by the receiving worker.
It increments a pause request counter and kicks the vCPU.

This event is sent by the vCPU thread, but has a low priority. It
will be sent after any other vCPU introspection event and when no vCPU
introspection command is queued.

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 68 ++
 include/uapi/linux/kvm_para.h  |  1 +
 include/uapi/linux/kvmi.h  |  7 +++
 virt/kvm/kvmi.c| 65 
 virt/kvm/kvmi_int.h|  4 ++
 virt/kvm/kvmi_msg.c| 61 +++
 6 files changed, 206 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index eef32107837a..558d3eb6007f 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -820,6 +820,48 @@ one page (offset + size <= PAGE_SIZE).
 
 * -KVM_EINVAL - the specified gpa is invalid
 
+16. KVMI_PAUSE_VCPU
+-------------------
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_pause_vcpu {
+   __u8 wait;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 padding3;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+
+Kicks the vCPU out of guest mode.
+
+If `wait` is 1, the command will wait for the vCPU to acknowledge the IPI.
+
+The vCPU will handle the pending commands/events and send the
+*KVMI_EVENT_PAUSE_VCPU* event (one for every successful *KVMI_PAUSE_VCPU*
+command) before returning to guest.
+
+Please note that new vCPUs might be created at any time.
+The introspection tool should use *KVMI_CONTROL_VM_EVENTS* to enable the
+*KVMI_EVENT_CREATE_VCPU* event in order to stop these new vCPUs as well
+(by delaying the event reply).
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_EBUSY  - the selected vCPU has too many queued *KVMI_EVENT_PAUSE_VCPU* events
+* -KVM_EPERM  - the *KVMI_EVENT_PAUSE_VCPU* event is disallowed (see *KVMI_CONTROL_EVENTS*)
+   and the introspection tool expects a reply.
+
 Events
 ==
 
@@ -992,3 +1034,29 @@ The *RETRY* action is used by the introspector to retry 
the execution of
 the current instruction. Either using single-step (if ``singlestep`` is
 not zero) or return to guest (if the introspector changed the instruction
 pointer or the page restrictions).
+
+4. KVMI_EVENT_PAUSE_VCPU
+------------------------
+
+:Architectures: all
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent in response to a *KVMI_PAUSE_VCPU* command and
+cannot be disabled via *KVMI_CONTROL_EVENTS*.
+
+This event has a low priority. It will be sent after any other vCPU
+introspection event and when no vCPU introspection command is queued.
+
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 54c0e20f5b64..07e3f2662b36 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -18,6 +18,7 @@
 #define KVM_EPERM  EPERM
 #define KVM_EOPNOTSUPP 95
 #define KVM_EAGAIN 11
+#define KVM_EBUSY  EBUSY
 #define KVM_ENOMEM ENOMEM
 
 #define KVM_HC_VAPIC_POLL_IRQ  1
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index be3f066f314e..ca9c6b6aeed5 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -177,6 +177,13 @@ struct kvmi_get_vcpu_info_reply {
__u64 tsc_speed;
 };
 
+struct kvmi_pause_vcpu {
+   __u8 wait;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 padding3;
+};
+
 struct kvmi_control_events {
__u16 event_id;
__u8 enable;
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index a84eb150e116..85de2da3eb7b 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 
+#define MAX_PAUSE_REQUESTS 1001
+
 static struct kmem_cache *msg_cache;
 static struct kmem_cache *radix_cache;
 static struct kmem_cache *job_cache;
@@ -1090,6 +1092,39 @@ static bool kvmi_create_vcpu_event(struct kvm_vcpu *vcpu)
return ret;
 }
 
+static bool __kvmi_pause_vcpu_event(struct kvm_vcpu *vcpu)
+{
+   u32 action;
+   bool ret = false;
+
+   action = kvmi_msg_send_pause_vcpu(vcpu);
+   switch (action) {
+   case KVMI_EVENT_ACTION_CONTINUE:
+   ret = true;
+   break;
+   default:
+   kvmi_handle_common_event_actions(vcpu, action, "PAUSE");
+   }
+
+   return ret;
+}
+
+static bool kvmi_pause_vcpu_event(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm;
+   bool ret = true;
+
+   ikvm = kvmi_get(vcpu->kvm);
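
To show the intended use, here is a hypothetical tool-side helper that pauses
every vCPU. The wire header and kvmi_vcpu_hdr layouts are the same
assumptions as in the earlier KVMI_EVENT_HYPERCALL sketch; the command struct
is the one documented above. Error-code replies are not read here for
brevity:

   #include <unistd.h>
   #include <linux/types.h>

   /* Assumed layouts, repeated from the earlier sketch. */
   struct kvmi_msg_hdr  { __u16 id; __u16 seq; __u32 size; };
   struct kvmi_vcpu_hdr { __u16 vcpu; __u16 padding1; __u32 padding2; };

   static int pause_all_vcpus(int fd, __u16 vcpu_count)
   {
           struct {
                   struct kvmi_msg_hdr hdr;
                   struct kvmi_vcpu_hdr vcpu;
                   struct kvmi_pause_vcpu cmd;
           } req = {0};
           __u16 i;

           req.hdr.id = KVMI_PAUSE_VCPU;
           req.hdr.size = sizeof(req) - sizeof(req.hdr);
           req.cmd.wait = 1;       /* wait for each vCPU to ack the kick */

           for (i = 0; i < vcpu_count; i++) {
                   req.vcpu.vcpu = i;
                   req.hdr.seq = i;
                   if (write(fd, &req, sizeof(req)) != sizeof(req))
                           return -1;
                   /* one KVMI_EVENT_PAUSE_VCPU event follows per command */
           }
           return 0;
   }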

[RFC PATCH v6 38/92] KVM: VMX: Add init/set/get functions for SPP

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

init_spp() must be called before the {get, set}_subpage
functions. It creates the subpage access bitmaps for memory pages
and issues a KVM request to set up the SPPT root pages.

kvm_mmu_set_subpages() enables the SPP bit in the EPT leaf page
and sets up the corresponding SPPT entries. The mmu_lock
is held around this operation: if it's called from the EPT fault or
SPPT mis-config induced handler, mmu_lock is acquired outside
the function; otherwise it's acquired inside it.

kvm_mmu_get_subpages() is used to query the access bitmap for a
protected page; it's also used in the EPT fault handler to check
whether the faulting EPT page is SPP protected.

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-6-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |  18 
 arch/x86/include/asm/vmx.h  |   2 +
 arch/x86/kvm/mmu.c  | 160 
 arch/x86/kvm/vmx/vmx.c  |  48 ++
 arch/x86/kvm/x86.c  |  57 
 include/linux/kvm_host.h|   3 +
 include/uapi/linux/kvm.h|   9 ++
 7 files changed, 297 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0878631b12a..7ee6e1ff5ee9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -399,8 +399,13 @@ struct kvm_mmu {
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
   u64 *spte, const void *pte);
+   int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+   int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+   int (*init_spp)(struct kvm *kvm);
+
hpa_t root_hpa;
gpa_t root_cr3;
+   hpa_t sppt_root;
union kvm_mmu_role mmu_role;
u8 root_level;
u8 shadow_root_level;
@@ -929,6 +934,8 @@ struct kvm_arch {
 
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+
+   bool spp_active;
 };
 
 struct kvm_vm_stat {
@@ -1202,6 +1209,11 @@ struct kvm_x86_ops {
int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu,
   uint16_t *vmcs_version);
uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
+
+   bool (*get_spp_status)(void);
+   int (*get_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+   int (*set_subpages)(struct kvm *kvm, struct kvm_subpage *spp_info);
+   int (*init_spp)(struct kvm *kvm);
 };
 
 struct kvm_arch_async_pf {
@@ -1420,6 +1432,12 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool 
skip_tlb_flush);
 
+int kvm_mmu_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+bool mmu_locked);
+int kvm_mmu_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info,
+bool mmu_locked);
+int kvm_mmu_init_spp(struct kvm *kvm);
+
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a2c9e18e0ad7..6cb05ac07453 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -224,6 +224,8 @@ enum vmcs_field {
XSS_EXIT_BITMAP_HIGH= 0x202D,
ENCLS_EXITING_BITMAP= 0x202E,
ENCLS_EXITING_BITMAP_HIGH   = 0x202F,
+   SPPT_POINTER= 0x2030,
+   SPPT_POINTER_HIGH   = 0x2031,
TSC_MULTIPLIER  = 0x2032,
TSC_MULTIPLIER_HIGH = 0x2033,
GUEST_PHYSICAL_ADDRESS  = 0x2400,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2774bbcfeed..38e79210d010 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3846,6 +3846,9 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct 
kvm_mmu *mmu,
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
   &invalid_list);
+   if (vcpu->kvm->arch.spp_active)
+   mmu_free_root_page(vcpu->kvm, &mmu->sppt_root,
+  &invalid_list);
} else {
for (i = 0; i < 4; ++i)
if (mmu->pae_root[i] != 0)
@@ -4510,6 +4513,158 @@ int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu,
return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_setup_spp_structure);
+
+int kvm_mmu_init_spp(struct kvm *kvm)
+{
+   int i, ret;
+  

[RFC PATCH v6 82/92] kvm: x86: emulate movq r, xmm

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This adds support for movq r, xmm. It introduces a new flag (GPRModRM)
to tell decode_modrm() that the encoded register is a general-purpose
one.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2297955d0934..7c79504e58cd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -172,6 +172,7 @@
 #define NoMod  ((u64)1 << 47)  /* Mod field is ignored */
 #define Intercept   ((u64)1 << 48)  /* Has valid intercept field */
 #define CheckPerm   ((u64)1 << 49)  /* Has valid check_perm field */
+#define GPRModRM    ((u64)1 << 50) /* The ModRM encoded register is a GP one */
 #define PrivUD  ((u64)1 << 51)  /* #UD instead of #GP on CPL > 0 */
 #define NearBranch  ((u64)1 << 52)  /* Near branches */
 #define No16   ((u64)1 << 53)  /* No 16 bit operand */
@@ -1197,6 +1198,11 @@ static u8 simd_prefix_to_bytes(const struct 
x86_emulate_ctxt *ctxt,
if (simd_prefix == 0x66)
bytes = 8;
break;
+   case 0x6e:
+   /* movq r/m64, xmm */
+   if (simd_prefix == 0x66)
+   bytes = 8;
+   break;
default:
break;
}
@@ -1262,7 +1268,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
ctxt->d & ByteOp);
-   if (ctxt->d & Sse) {
+   if ((ctxt->d & Sse) && !(ctxt->d & GPRModRM)) {
op->type = OP_XMM;
op->bytes = ctxt->op_bytes;
op->addr.xmm = ctxt->modrm_rm;
@@ -4546,6 +4552,10 @@ static const struct gprefix pfx_0f_6f_0f_7f = {
I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
 };
 
+static const struct gprefix pfx_0f_6e_0f_7e = {
+   N, I(Sse, em_mov), N, N
+};
+
 static const struct instr_dual instr_dual_0f_2b = {
I(0, em_mov), N
 };
@@ -4807,7 +4817,8 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, N,
N, N, N, N,
-   N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+   N, N, GP(SrcMem | DstReg | ModRM | GPRModRM | Mov, &pfx_0f_6e_0f_7e),
+   GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x70 - 0x7F */
N, N, N, N,
N, N, N, N,

[RFC PATCH v6 43/92] kvm: introspection: add KVMI_CONTROL_SPP

2019-08-09 Thread Adalbert Lazăr
This command enables/disables subpage protection (SPP) for the current VM.

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 33 ++
 arch/x86/kvm/kvmi.c|  4 
 include/uapi/linux/kvmi.h  |  7 +++
 virt/kvm/kvmi_int.h|  6 ++
 virt/kvm/kvmi_msg.c| 33 ++
 5 files changed, 83 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index b64a030507cf..c1d12aaa8633 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -617,6 +617,39 @@ In order to 'forget' an address, all the access bits 
('rwx') must be set.
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOMEM - not enough memory to add the page tracking structures
 
+11. KVMI_CONTROL_SPP
+--------------------
+
+:Architectures: x86/intel
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_control_spp {
+   __u8 enable;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 padding3;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+
+Enables/disables subpage protection (SPP) for the current VM.
+
+If SPP is not enabled, *KVMI_GET_PAGE_WRITE_BITMAP* and
+*KVMI_SET_PAGE_WRITE_BITMAP* commands will fail.
+
+:Errors:
+
+* -KVM_EINVAL - padding is not zero
+* -KVM_EOPNOTSUPP - the hardware doesn't support SPP
+* -KVM_EOPNOTSUPP - the current implementation can't disable SPP
+
 Events
 ==
 
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 3238ef176ad6..01fd218e213c 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -260,3 +260,7 @@ int kvmi_arch_cmd_set_page_access(struct kvmi *ikvm,
return ec;
 }
 
+int kvmi_arch_cmd_control_spp(struct kvmi *ikvm)
+{
+   return kvm_arch_init_spp(ikvm->kvm);
+}
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 2ddbb1fea807..9f2b13718e47 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -142,6 +142,13 @@ struct kvmi_set_page_access {
struct kvmi_page_access_entry entries[0];
 };
 
+struct kvmi_control_spp {
+   __u8 enable;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 padding3;
+};
+
 struct kvmi_get_vcpu_info_reply {
__u64 tsc_speed;
 };
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index c54be93349b7..3f0c7a03b4a1 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -130,6 +130,11 @@ struct kvmi {
DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
DECLARE_BITMAP(vm_ev_mask, KVMI_NUM_EVENTS);
 
+   struct {
+   bool initialized;
+   atomic_t enabled;
+   } spp;
+
bool cmd_reply_disabled;
 };
 
@@ -184,6 +189,7 @@ int kvmi_arch_cmd_get_page_access(struct kvmi *ikvm,
 int kvmi_arch_cmd_set_page_access(struct kvmi *ikvm,
  const struct kvmi_msg_hdr *msg,
  const struct kvmi_set_page_access *req);
+int kvmi_arch_cmd_control_spp(struct kvmi *ikvm);
 void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct kvmi_event *ev);
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
u8 access);
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index c150e7bdd440..e501a807c8a2 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -25,6 +25,7 @@ static const char *const msg_IDs[] = {
[KVMI_CHECK_EVENT]   = "KVMI_CHECK_EVENT",
[KVMI_CONTROL_CMD_RESPONSE]  = "KVMI_CONTROL_CMD_RESPONSE",
[KVMI_CONTROL_EVENTS]= "KVMI_CONTROL_EVENTS",
+   [KVMI_CONTROL_SPP]   = "KVMI_CONTROL_SPP",
[KVMI_CONTROL_VM_EVENTS] = "KVMI_CONTROL_VM_EVENTS",
[KVMI_EVENT] = "KVMI_EVENT",
[KVMI_EVENT_REPLY]   = "KVMI_EVENT_REPLY",
@@ -300,6 +301,37 @@ static int kvmi_get_vcpu(struct kvmi *ikvm, unsigned int 
vcpu_idx,
return 0;
 }
 
+static bool enable_spp(struct kvmi *ikvm)
+{
+   if (!ikvm->spp.initialized) {
+   int err = kvmi_arch_cmd_control_spp(ikvm);
+
+   ikvm->spp.initialized = true;
+
+   if (!err)
+   atomic_set(&ikvm->spp.enabled, true);
+   }
+
+   return atomic_read(&ikvm->spp.enabled);
+}
+
+static int handle_control_spp(struct kvmi *ikvm,
+ const struct kvmi_msg_hdr *msg,
+ const void *_req)
+{
+   const struct kvmi_control_spp *req = _req;
+   int ec;
+
+   if (req->padding1 || req->padding2 || req->padding3)
+   ec = -KVM_EINVAL;
+   else if (req->enable && enable_spp(ikvm))
+   ec = 0;
+   else
+   ec = -KVM_EOPNOTSUPP;
+
+   return kvmi_msg_vm_maybe_reply(ikvm, msg, ec, NULL, 0);
+}

[RFC PATCH v6 07/92] kvm: introspection: honor the reply option when handling the KVMI_GET_VERSION command

2019-08-09 Thread Adalbert Lazăr
Obviously, the KVMI_GET_VERSION command must not be used when the command
reply is disabled by a previous KVMI_CONTROL_CMD_RESPONSE command.

This commit changes the code path in order to check the reply option
(enabled/disabled) before trying to reply to this command. If the command
reply is disabled it will return an error to the caller. In the end, the
receiving worker will finish and the introspection socket will be closed.

Signed-off-by: Adalbert Lazăr 
---
 virt/kvm/kvmi_msg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index ea5c7e23669a..2237a6ed25f6 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -169,7 +169,7 @@ static int handle_get_version(struct kvmi *ikvm,
memset(&rpl, 0, sizeof(rpl));
rpl.version = KVMI_VERSION;
 
-   return kvmi_msg_vm_reply(ikvm, msg, 0, &rpl, sizeof(rpl));
+   return kvmi_msg_vm_maybe_reply(ikvm, msg, 0, &rpl, sizeof(rpl));
 }
 
 static bool is_command_allowed(struct kvmi *ikvm, int id)

[RFC PATCH v6 89/92] kvm: x86: make lock cmpxchg r, r/m atomic

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

The current emulation takes place in two steps: the first does all the
actions that a cmpxchg would do, sets ZF and saves all results in a
temporary storage (the emulation context). It's the second step that
does the actual atomic operation (actually uses cmpxchg). The problem
with this approach is that steps one and two can observe different
values in memory and when that happens RAX and RFLAGS will have invalid
values when returning to the guest as emulator_cmpxchg_emulated() does
not set these.

This patch modifies the prototype of emulator_cmpxchg_emulated() so that
when cmpxchg fails, it returns in *old the current value. We also modify
em_cmpxchg() so that if the LOCK prefix is present we invoke
emulator_cmpxchg_emulated() directly and set RAX and RFLAGS. Note that we
also disable writeback as it is no longer needed.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_emulate.h |  2 +-
 arch/x86/kvm/emulate.c | 57 +++---
 arch/x86/kvm/x86.c | 48 ++---
 3 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h 
b/arch/x86/include/asm/kvm_emulate.h
index 97cb592687cb..863c04561a37 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -178,7 +178,7 @@ struct x86_emulate_ops {
 */
int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
-   const void *old,
+   void *old,
const void *new,
unsigned int bytes,
struct x86_exception *fault);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7261b94c6c00..dac4c0ca1ee3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1547,11 +1547,15 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt 
*ctxt,
 {
int rc;
ulong linear;
+   unsigned char buf[16];
 
rc = linearize(ctxt, addr, size, true, &linear);
if (rc != X86EMUL_CONTINUE)
return rc;
-   return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+   if (size > sizeof(buf))
+   return X86EMUL_UNHANDLEABLE;
+   memcpy(buf, orig_data, size);
+   return ctxt->ops->cmpxchg_emulated(ctxt, linear, buf, data,
   size, &ctxt->exception);
 }
 
@@ -1803,16 +1807,21 @@ static int __load_segment_descriptor(struct 
x86_emulate_ctxt *ctxt,
/* CS(RPL) <- CPL */
selector = (selector & 0xfffc) | cpl;
break;
-   case VCPU_SREG_TR:
+   case VCPU_SREG_TR: {
+   struct desc_struct buf;
+
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
-   old_desc = seg_desc;
+   buf = old_desc = seg_desc;
seg_desc.type |= 2; /* busy */
-   ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
- sizeof(seg_desc), &ctxt->exception);
+   ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &buf,
+ &seg_desc,
+ sizeof(seg_desc),
+ &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
break;
+   }
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
goto exception;
@@ -2384,6 +2393,44 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
 
 static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 {
+   if (ctxt->lock_prefix) {
+   int rc;
+   ulong linear;
+   u64 old = reg_read(ctxt, VCPU_REGS_RAX);
+   u64 new = ctxt->src.val64;
+
+   /* disable writeback altogether */
+   ctxt->d &= ~SrcWrite;
+   ctxt->d |= NoWrite;
+
+   rc = linearize(ctxt, ctxt->dst.addr.mem, ctxt->dst.bytes, true,
+  &linear);
+   if (rc != X86EMUL_CONTINUE)
+   return rc;
+
+   rc = ctxt->ops->cmpxchg_emulated(ctxt, linear, &old, &new,
+ctxt->dst.bytes,
+&ctxt->exception);
+
+   switch (rc) {
+   case X86EMUL_CONTINUE:
+   ctxt->eflags |= X86_EFLAGS_ZF;
+   break;
+   case X86EMUL_CMPXCHG_FAILED: {
+   u64 mask = BITMAP_LAST_WORD_MASK(ctxt->dst.bytes * 8);
+
+
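
The failure semantics described in the commit message (return the current
memory value in *old) mirror the GCC/Clang __atomic_compare_exchange_n
builtin, which rewrites the expected value on failure. A small user-space
illustration, not patch code:

   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
           uint64_t mem = 5;        /* the "guest memory" operand */
           uint64_t expected = 4;   /* a stale RAX snapshot */
           uint64_t desired = 9;

           /* Fails because mem != expected; 'expected' is rewritten to 5,
            * exactly the value the emulator must propagate back into RAX. */
           if (!__atomic_compare_exchange_n(&mem, &expected, desired, 0,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST))
                   printf("failed, current value = %llu\n",
                          (unsigned long long)expected);

           return 0;
   }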

[RFC PATCH v6 36/92] KVM: VMX: Implement functions for SPPT paging setup

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

SPPT is a 4-level paging structure similar to EPT. When SPP is
enabled for a target physical page, bit 61 of the corresponding
EPT entry is set, then SPPT is traversed with the gfn to
build up entries. The leaf entry of SPPT contains the access
bitmap for the subpages inside the target 4KB physical page, one bit
per 128-byte subpage.

SPPT entries are set up in the following cases:
1. the EPT faulted page is SPP protected.
2. an SPP mis-config induced vmexit is handled.
3. the user configures SPP protected pages via the SPP IOCTLs.

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-4-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |   7 +-
 arch/x86/kvm/mmu.c  | 207 
 arch/x86/kvm/mmu.h  |   1 +
 include/linux/kvm_host.h|   3 +
 4 files changed, 217 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f1b3d89a0430..c05984f39923 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -271,7 +271,8 @@ union kvm_mmu_page_role {
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
-   unsigned :6;
+   unsigned spp:1;
+   unsigned reserved:5;
 
/*
 * This is left at the top of the word so that
@@ -1410,6 +1411,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
   void *insn, int insn_len);
+
+int kvm_mmu_setup_spp_structure(struct kvm_vcpu *vcpu,
+   u32 access_map, gfn_t gfn);
+
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool 
skip_tlb_flush);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 810e3e5bd575..8a6287cd2be4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -206,6 +206,11 @@ static const union kvm_mmu_page_role mmu_base_role_mask = {
({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 __shadow_walk_next(&(_walker), spte))
 
+#define for_each_shadow_spp_entry(_vcpu, _addr, _walker)\
+   for (shadow_spp_walk_init(&(_walker), _vcpu, _addr);\
+shadow_walk_okay(&(_walker));  \
+shadow_walk_next(&(_walker)))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -505,6 +510,11 @@ static int is_shadow_present_pte(u64 pte)
return (pte != 0) && !is_mmio_spte(pte);
 }
 
+static int is_spp_shadow_present(u64 pte)
+{
+   return pte & PT_PRESENT_MASK;
+}
+
 static int is_large_pte(u64 pte)
 {
return pte & PT_PAGE_SIZE_MASK;
@@ -524,6 +534,11 @@ static bool is_executable_pte(u64 spte)
return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 }
 
+static bool is_spp_spte(struct kvm_mmu_page *sp)
+{
+   return sp->role.spp;
+}
+
 static kvm_pfn_t spte_to_pfn(u64 pte)
 {
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -1751,6 +1766,87 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
return 0;
 }
 
+static bool __rmap_open_subpage_bit(struct kvm *kvm,
+   struct kvm_rmap_head *rmap_head)
+{
+   struct rmap_iterator iter;
+   bool flush = false;
+   u64 *sptep;
+   u64 spte;
+
+   for_each_rmap_spte(rmap_head, &iter, sptep) {
+   /*
+* SPP works only when the page is write-protected
+* and SPP bit is set in EPT leaf entry.
+*/
+   flush |= spte_write_protect(sptep, false);
+   spte = *sptep | PT_SPP_MASK;
+   flush |= mmu_spte_update(sptep, spte);
+   }
+
+   return flush;
+}
+
+static int kvm_mmu_open_subpage_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+   struct kvm_rmap_head *rmap_head;
+   bool flush = false;
+
+   /*
+* SPP is only supported with 4KB level1 memory page, check
+* if the page is mapped in EPT leaf entry.
+*/
+   rmap_head = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
+
+   if (!rmap_head->val)
+   return -EFAULT;
+
+   flush |= __rmap_open_subpage_bit(kvm, rmap_head);
+
+   if (flush)
+   kvm_flush_remote_tlbs(kvm);
+
+   return 0;
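
To make the leaf format concrete ("one bit per 128-byte subpage", so 32 bits
cover a 4KB page), here is an illustrative helper, not patch code, that maps
a page offset to its access bit:

   #define SPP_SUBPAGE_SIZE   128u  /* bytes covered by one access bit */
   #define SPP_BITMAP_BITS    (4096u / SPP_SUBPAGE_SIZE)  /* = 32 */

   /* Returns nonzero if the 128-byte subpage containing 'offset'
    * (0..4095) is writable according to the 32-bit access bitmap. */
   static inline int spp_subpage_writable(unsigned int access_map,
                                          unsigned int offset)
   {
           return (access_map >> (offset / SPP_SUBPAGE_SIZE)) & 1u;
   }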

[RFC PATCH v6 40/92] KVM: VMX: Handle SPP induced vmexit and page fault

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

If a write to a subpage is not allowed, an EPT violation is generated
and propagated to QEMU or the VMI application to handle.

If the target page is SPP protected but an SPPT miss is
encountered while traversing with the gfn, a vmexit is generated so
that KVM can handle the issue. Any SPPT misconfig will be
propagated to QEMU or the VMI application.

An SPP-specific bit (11) is added to exit_qualification and a new
exit reason (66) is introduced for SPP.

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-8-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/vmx.h  |  7 
 arch/x86/include/uapi/asm/vmx.h |  2 +
 arch/x86/kvm/mmu.c  | 17 
 arch/x86/kvm/vmx/vmx.c  | 71 +
 include/uapi/linux/kvm.h|  5 +++
 5 files changed, 102 insertions(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 6cb05ac07453..11ca64ced578 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -547,6 +547,13 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_EXECUTABLE   (1 << EPT_VIOLATION_EXECUTABLE_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED   (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
+/*
+ * Exit Qualifications for SPPT-Induced vmexits
+ */
+#define SPPT_INDUCED_EXIT_TYPE_BIT 11
+#define SPPT_INDUCED_EXIT_TYPE (1 << SPPT_INDUCED_EXIT_TYPE_BIT)
+#define SPPT_INTR_INFO_UNBLOCK_NMI INTR_INFO_UNBLOCK_NMI
+
 /*
  * VM-instruction error numbers
  */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index f0b0c90dd398..ac67622bac5a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -85,6 +85,7 @@
 #define EXIT_REASON_PML_FULL62
 #define EXIT_REASON_XSAVES  63
 #define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_SPP 66
 
 #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -141,6 +142,7 @@
{ EXIT_REASON_ENCLS, "ENCLS" }, \
{ EXIT_REASON_RDSEED,"RDSEED" }, \
{ EXIT_REASON_PML_FULL,  "PML_FULL" }, \
+   { EXIT_REASON_SPP,   "SPP" }, \
{ EXIT_REASON_XSAVES,"XSAVES" }, \
{ EXIT_REASON_XRSTORS,   "XRSTORS" }
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 38e79210d010..d59108a3ebbf 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3692,6 +3692,19 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t 
gva, int level,
if ((error_code & PFERR_WRITE_MASK) &&
spte_can_locklessly_be_made_writable(spte))
{
+   /*
+* Record write protect fault caused by
+* Sub-page Protection, let VMI decide
+* the next step.
+*/
+   if (spte & PT_SPP_MASK) {
+   fault_handled = true;
+   vcpu->run->exit_reason = KVM_EXIT_SPP;
+   vcpu->run->spp.addr = gva;
+   kvm_skip_emulated_instruction(vcpu);
+   break;
+   }
+
new_spte |= PT_WRITABLE_MASK;
 
/*
@@ -5880,6 +5893,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, 
u64 error_code,
r = vcpu->arch.mmu->page_fault(vcpu, cr2,
   lower_32_bits(error_code),
   false);
+
+   if (vcpu->run->exit_reason == KVM_EXIT_SPP)
+   return 0;
+
WARN_ON(r == RET_PF_INVALID);
}
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a50dd2b9d438..5d4b61aaff9a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5335,6 +5335,76 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
 }
 
+static int handle_spp(struct kvm_vcpu *vcpu)
+{
+   unsigned long exit_qualification;
+   struct kvm_memory_slot *slot;
+   gpa_t gpa;
+   gfn_t gfn;
+
+   exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+   /*
+* SPP VM exit happened while executing iret from NMI,
+* "blocked by NMI" bit has to be set before next VM entry.
+* There are errata that may cause this bit to not be set:
+* AAK134, BY25.
+*/
+   if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+   (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI))
+   vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
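
On the userspace side the new exit reason surfaces through the regular
kvm_run loop. A hedged sketch: KVM_EXIT_SPP and run->spp.addr come from this
patch, while handle_spp_write() is a hypothetical callback:

   #include <sys/ioctl.h>
   #include <linux/kvm.h>

   static void vcpu_loop(int vcpu_fd, struct kvm_run *run)
   {
           for (;;) {
                   if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                           break;

                   switch (run->exit_reason) {
                   case KVM_EXIT_SPP:
                           /* a write hit a write-protected 128-byte subpage;
                            * the faulting address was recorded by
                            * fast_page_fault() above */
                           handle_spp_write(run->spp.addr);
                           break;
                   default:
                           return;
                   }
           }
   }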

[RFC PATCH v6 66/92] kvm: introspection: add custom input when single-stepping a vCPU

2019-08-09 Thread Adalbert Lazăr
The introspection tool can respond to a KVMI_EVENT_PF event with custom
input for the current instruction. This input is used to trick the guest
software into believing it has read certain data, in order to hide the
content of certain memory areas (eg. hide injected code from integrity
checkers). There are cases when this can happen while the vCPU has to
be single stepped, Either the current instruction is not supported by
the KVM emulator or the introspection tool requested single-stepping.

This patch saves the old data, write the custom input, start the single
stepping and restore the old data.

Signed-off-by: Adalbert Lazăr 
---
 virt/kvm/kvmi.c | 119 
 virt/kvm/kvmi_int.h |   3 ++
 2 files changed, 122 insertions(+)

diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 3dfedf3ae739..06dc23f40ded 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -1618,6 +1618,116 @@ int kvmi_cmd_pause_vcpu(struct kvm_vcpu *vcpu, bool 
wait)
return 0;
 }
 
+static int write_custom_data_to_page(struct kvm_vcpu *vcpu, gva_t gva,
+   u8 *backup, size_t bytes)
+{
+   u8 *ptr_page, *ptr;
+   struct page *page;
+   gpa_t gpa;
+
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+   if (gpa == UNMAPPED_GVA)
+   return -KVM_EINVAL;
+
+   ptr_page = get_page_ptr(vcpu->kvm, gpa, &page, true);
+   if (!ptr_page)
+   return -KVM_EINVAL;
+
+   ptr = ptr_page + (gpa & ~PAGE_MASK);
+
+   memcpy(backup, ptr, bytes);
+   use_custom_input(vcpu, gva, ptr, bytes);
+
+   put_page_ptr(ptr_page, page);
+
+   return 0;
+}
+
+static int write_custom_data(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm = IKVM(vcpu->kvm);
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   size_t bytes = ivcpu->ctx_size;
+   gva_t gva = ivcpu->ctx_addr;
+   u8 *backup;
+
+   if (ikvm->ss_custom_size)
+   return 0;
+
+   if (!bytes)
+   return 0;
+
+   backup = ikvm->ss_custom_data;
+
+   while (bytes) {
+   size_t offset = gva & ~PAGE_MASK;
+   size_t chunk = min(bytes, PAGE_SIZE - offset);
+
+   if (write_custom_data_to_page(vcpu, gva, backup, chunk))
+   return -KVM_EINVAL;
+
+   bytes -= chunk;
+   backup += chunk;
+   gva += chunk;
+   ikvm->ss_custom_size += chunk;
+   }
+
+   return 0;
+}
+
+static int restore_backup_data_to_page(struct kvm_vcpu *vcpu, gva_t gva,
+   u8 *src, size_t bytes)
+{
+   u8 *ptr_page, *ptr;
+   struct page *page;
+   gpa_t gpa;
+
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+   if (gpa == UNMAPPED_GVA)
+   return -KVM_EINVAL;
+
+   ptr_page = get_page_ptr(vcpu->kvm, gpa, &page, true);
+   if (!ptr_page)
+   return -KVM_EINVAL;
+
+   ptr = ptr_page + (gpa & ~PAGE_MASK);
+
+   memcpy(ptr, src, bytes);
+
+   put_page_ptr(ptr_page, page);
+
+   return 0;
+}
+
+static void restore_backup_data(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm = IKVM(vcpu->kvm);
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   size_t bytes = ikvm->ss_custom_size;
+   gva_t gva = ivcpu->ctx_addr;
+   u8 *backup;
+
+   if (!bytes)
+   return;
+
+   backup = ikvm->ss_custom_data;
+
+   while (bytes) {
+   size_t offset = gva & ~PAGE_MASK;
+   size_t chunk = min(bytes, PAGE_SIZE - offset);
+
+   if (restore_backup_data_to_page(vcpu, gva, backup, chunk))
+   goto out;
+
+   bytes -= chunk;
+   backup += chunk;
+   gva += chunk;
+   }
+
+out:
+   ikvm->ss_custom_size = 0;
+}
+
 void kvmi_stop_ss(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
@@ -1642,6 +1752,8 @@ void kvmi_stop_ss(struct kvm_vcpu *vcpu)
 
ikvm->ss_level = 0;
 
+   restore_backup_data(vcpu);
+
kvmi_arch_stop_single_step(vcpu);
 
atomic_set(>ss_active, false);
@@ -1676,6 +1788,7 @@ static bool kvmi_acquire_ss(struct kvm_vcpu *vcpu)
KVM_REQUEST_WAIT);
 
ivcpu->ss_owner = true;
+   ikvm->ss_custom_size = 0;
 
return true;
 }
@@ -1690,6 +1803,12 @@ static bool kvmi_run_ss(struct kvm_vcpu *vcpu, gpa_t 
gpa, u8 access)
 
kvmi_arch_start_single_step(vcpu);
 
+   err = write_custom_data(vcpu);
+   if (err) {
+   kvmi_err(ikvm, "writing custom data failed, err %d\n", err);
+   return false;
+   }
+
err = kvmi_get_gfn_access(ikvm, gfn, &old_access, &old_write_bitmap);
/* likely was removed from radix tree due to rwx */
if (err) {
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_

[RFC PATCH v6 08/92] kvm: introspection: add KVMI_CHECK_COMMAND and KVMI_CHECK_EVENT

2019-08-09 Thread Adalbert Lazăr
These commands can be used by the introspection tool to check what
introspection commands and events are supported (by KVMi) and allowed
(by userspace/QEMU).

The introspection tool will get one of the following error codes:
  * -KVM_EOPNOTSUPP (unsupported command/event)
  * -KVM_EPERM (disallowed command/event)
  * -KVM_EINVAL (the padding space, used for future extensions,
 is not zero)
  * 0 (the command/event is supported and allowed)

These commands can be seen as an alternative method to KVMI_GET_VERSION
in checking if the introspection supports a specific command/event.

As with the KVMI_GET_VERSION command, these commands can never be
disallowed by userspace/QEMU.

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 60 ++
 include/uapi/linux/kvmi.h  | 12 ++
 virt/kvm/kvmi.c|  8 +++-
 virt/kvm/kvmi_msg.c| 38 +++
 4 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 82de474d512b..61cf69aa5d07 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -302,3 +302,63 @@ While the command reply is disabled:
 * the reply status is ignored for any unsupported/unknown or disallowed
   commands (and ``struct kvmi_error_code`` will be sent with -KVM_EOPNOTSUPP
   or -KVM_PERM).
+
+3. KVMI_CHECK_COMMAND
+---------------------
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_check_command {
+   __u16 id;
+   __u16 padding1;
+   __u32 padding2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+
+Checks if the command specified by ``id`` is allowed.
+
+This command is always allowed.
+
+:Errors:
+
+* -KVM_EPERM - the command specified by ``id`` is disallowed
+* -KVM_EINVAL - padding is not zero
+
+4. KVMI_CHECK_EVENT
+-------------------
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_check_event {
+   __u16 id;
+   __u16 padding1;
+   __u32 padding2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+
+Checks if the event specified by ``id`` is allowed.
+
+This command is always allowed.
+
+:Errors:
+
+* -KVM_EPERM - the event specified by ``id`` is disallowed
+* -KVM_EINVAL - padding is not zero
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index a1ab39c5b8e0..7390303371c9 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -90,4 +90,16 @@ struct kvmi_control_cmd_response {
__u32 padding2;
 };
 
+struct kvmi_check_command {
+   __u16 id;
+   __u16 padding1;
+   __u32 padding2;
+};
+
+struct kvmi_check_event {
+   __u16 id;
+   __u16 padding1;
+   __u32 padding2;
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index d5b6af21564e..dc1bb8326763 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -69,6 +69,8 @@ static bool alloc_kvmi(struct kvm *kvm, const struct 
kvm_introspection *qemu)
return false;
 
set_bit(KVMI_GET_VERSION, ikvm->cmd_allow_mask);
+   set_bit(KVMI_CHECK_COMMAND, ikvm->cmd_allow_mask);
+   set_bit(KVMI_CHECK_EVENT, ikvm->cmd_allow_mask);
 
memcpy(>uuid, >uuid, sizeof(ikvm->uuid));
 
@@ -295,10 +297,14 @@ int kvmi_ioctl_command(struct kvm *kvm, void __user *argp)
if (!allow) {
DECLARE_BITMAP(always_allowed, KVMI_NUM_COMMANDS);
 
-   if (id == KVMI_GET_VERSION)
+   if (id == KVMI_GET_VERSION
+   || id == KVMI_CHECK_COMMAND
+   || id == KVMI_CHECK_EVENT)
return -EPERM;
 
set_bit(KVMI_GET_VERSION, always_allowed);
+   set_bit(KVMI_CHECK_COMMAND, always_allowed);
+   set_bit(KVMI_CHECK_EVENT, always_allowed);
 
bitmap_andnot(requested, requested, always_allowed,
  KVMI_NUM_COMMANDS);
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 2237a6ed25f6..e24996611e3a 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -9,6 +9,8 @@
 #include "kvmi_int.h"
 
 static const char *const msg_IDs[] = {
+   [KVMI_CHECK_COMMAND] = "KVMI_CHECK_COMMAND",
+   [KVMI_CHECK_EVENT]   = "KVMI_CHECK_EVENT",
[KVMI_CONTROL_CMD_RESPONSE]  = "KVMI_CONTROL_CMD_RESPONSE",
[KVMI_GET_VERSION]   = "KVMI_GET_VERSION",
 };
@@ -177,6 +179,40 @@ static bool is_command_allowed(struct kvmi *ikvm, int id)
return test_bit(id, ikvm->cmd_allow_mask);
 }
 
+static int handle_check_command(struct kvmi *ikvm,
+   const struct kvmi_msg_hdr *msg,
+   const void *_req)
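
The startup pattern this enables, sketched with the same assumed wire header
as the earlier examples (kvmi_error_code's layout is also an assumption):

   #include <unistd.h>
   #include <linux/types.h>

   struct kvmi_msg_hdr    { __u16 id; __u16 seq; __u32 size; }; /* assumed */
   struct kvmi_error_code { __s32 err; __u32 padding; };        /* assumed */

   /* Returns 0 when 'id' is supported and allowed, or the negative error
    * code listed in the commit message (-KVM_EOPNOTSUPP / -KVM_EPERM). */
   static int kvmi_probe_command(int fd, __u16 id)
   {
           struct {
                   struct kvmi_msg_hdr hdr;
                   struct kvmi_check_command cmd;
           } req = {0};
           struct {
                   struct kvmi_msg_hdr hdr;
                   struct kvmi_error_code ec;
           } rpl;

           req.hdr.id = KVMI_CHECK_COMMAND;
           req.hdr.size = sizeof(req.cmd);
           req.cmd.id = id;

           if (write(fd, &req, sizeof(req)) != sizeof(req) ||
               read(fd, &rpl, sizeof(rpl)) != sizeof(rpl))
                   return -1;
           return rpl.ec.err;
   }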

[RFC PATCH v6 09/92] kvm: introspection: add KVMI_GET_GUEST_INFO

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

For now, this command returns only the number of online vCPUs.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 18 ++
 include/uapi/linux/kvmi.h  |  5 +
 virt/kvm/kvmi_msg.c| 14 ++
 3 files changed, 37 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 61cf69aa5d07..2fbe7c28e4f1 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -362,3 +362,21 @@ This command is always allowed.
 
 * -KVM_PERM - the event specified by ``id`` is disallowed
 * -KVM_EINVAL - padding is not zero
+
+5. KVMI_GET_GUEST_INFO
+----------------------
+
+:Architectures: all
+:Versions: >= 1
+:Parameters: none
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_guest_info_reply {
+   __u32 vcpu_count;
+   __u32 padding[3];
+   };
+
+Returns the number of online vCPUs.
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 7390303371c9..367c8ec28f75 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -102,4 +102,9 @@ struct kvmi_check_event {
__u32 padding2;
 };
 
+struct kvmi_get_guest_info_reply {
+   __u32 vcpu_count;
+   __u32 padding[3];
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index e24996611e3a..cf8a120b0eae 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -12,6 +12,7 @@ static const char *const msg_IDs[] = {
[KVMI_CHECK_COMMAND] = "KVMI_CHECK_COMMAND",
[KVMI_CHECK_EVENT]   = "KVMI_CHECK_EVENT",
[KVMI_CONTROL_CMD_RESPONSE]  = "KVMI_CONTROL_CMD_RESPONSE",
+   [KVMI_GET_GUEST_INFO]= "KVMI_GET_GUEST_INFO",
[KVMI_GET_VERSION]   = "KVMI_GET_VERSION",
 };
 
@@ -213,6 +214,18 @@ static int handle_check_event(struct kvmi *ikvm,
return kvmi_msg_vm_maybe_reply(ikvm, msg, ec, NULL, 0);
 }
 
+static int handle_get_guest_info(struct kvmi *ikvm,
+const struct kvmi_msg_hdr *msg,
+const void *req)
+{
+   struct kvmi_get_guest_info_reply rpl;
+
+   memset(&rpl, 0, sizeof(rpl));
+   rpl.vcpu_count = atomic_read(&ikvm->kvm->online_vcpus);
+
+   return kvmi_msg_vm_maybe_reply(ikvm, msg, 0, &rpl, sizeof(rpl));
+}
+
 static int handle_control_cmd_response(struct kvmi *ikvm,
const struct kvmi_msg_hdr *msg,
const void *_req)
@@ -246,6 +259,7 @@ static int(*const msg_vm[])(struct kvmi *, const struct 
kvmi_msg_hdr *,
[KVMI_CHECK_COMMAND] = handle_check_command,
[KVMI_CHECK_EVENT]   = handle_check_event,
[KVMI_CONTROL_CMD_RESPONSE]  = handle_control_cmd_response,
+   [KVMI_GET_GUEST_INFO]= handle_get_guest_info,
[KVMI_GET_VERSION]   = handle_get_version,
 };
 

[RFC PATCH v6 58/92] kvm: introspection: add KVMI_GET_MTRR_TYPE

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This command returns the memory type for a guest physical address.

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 32 ++
 arch/x86/include/uapi/asm/kvmi.h   |  9 +
 arch/x86/kvm/kvmi.c|  7 +++
 virt/kvm/kvmi_int.h|  1 +
 virt/kvm/kvmi_msg.c| 17 
 5 files changed, 66 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index c43ea1b33a51..e58f0e22f188 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1112,6 +1112,38 @@ the buffer size from the message size.
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOMEM - not enough memory to allocate the reply
 
+24. KVMI_GET_MTRR_TYPE
+----------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_get_mtrr_type {
+   __u64 gpa;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_mtrr_type_reply {
+   __u8 type;
+   __u8 padding[7];
+   };
+
+Returns the guest memory type for a specific physical address.
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+
 Events
 ==
 
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index a3fcb1ef8404..c3c96e6e2a26 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -101,4 +101,13 @@ struct kvmi_get_xsave_reply {
__u32 region[0];
 };
 
+struct kvmi_get_mtrr_type {
+   __u64 gpa;
+};
+
+struct kvmi_get_mtrr_type_reply {
+   __u8 type;
+   __u8 padding[7];
+};
+
 #endif /* _UAPI_ASM_X86_KVMI_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 078d714b59d5..0114ed66f4f3 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -811,3 +811,10 @@ int kvmi_arch_cmd_get_xsave(struct kvm_vcpu *vcpu,
 
return 0;
 }
+
+int kvmi_arch_cmd_get_mtrr_type(struct kvm_vcpu *vcpu, u64 gpa, u8 *type)
+{
+   *type = kvm_mtrr_get_guest_memory_type(vcpu, gpa_to_gfn(gpa));
+
+   return 0;
+}
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 1a705cba4776..ac2e13787f01 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -267,5 +267,6 @@ int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
 const struct kvmi_control_cr *req);
 int kvmi_arch_cmd_control_msr(struct kvm_vcpu *vcpu,
  const struct kvmi_control_msr *req);
+int kvmi_arch_cmd_get_mtrr_type(struct kvm_vcpu *vcpu, u64 gpa, u8 *type);
 
 #endif
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 6bc18b7973cf..ee54d92b07ec 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -33,6 +33,7 @@ static const char *const msg_IDs[] = {
[KVMI_EVENT_REPLY]   = "KVMI_EVENT_REPLY",
[KVMI_GET_CPUID] = "KVMI_GET_CPUID",
[KVMI_GET_GUEST_INFO]= "KVMI_GET_GUEST_INFO",
+   [KVMI_GET_MTRR_TYPE] = "KVMI_GET_MTRR_TYPE",
[KVMI_GET_PAGE_ACCESS]   = "KVMI_GET_PAGE_ACCESS",
[KVMI_GET_PAGE_WRITE_BITMAP] = "KVMI_GET_PAGE_WRITE_BITMAP",
[KVMI_GET_REGISTERS] = "KVMI_GET_REGISTERS",
@@ -701,6 +702,21 @@ static int handle_get_cpuid(struct kvm_vcpu *vcpu,
return reply_cb(vcpu, msg, ec, &rpl, sizeof(rpl));
 }
 
+static int handle_get_mtrr_type(struct kvm_vcpu *vcpu,
+   const struct kvmi_msg_hdr *msg,
+   const void *_req, vcpu_reply_fct reply_cb)
+{
+   const struct kvmi_get_mtrr_type *req = _req;
+   struct kvmi_get_mtrr_type_reply rpl;
+   int ec;
+
+   memset(&rpl, 0, sizeof(rpl));
+
+   ec = kvmi_arch_cmd_get_mtrr_type(vcpu, req->gpa, &rpl.type);
+
+   return reply_cb(vcpu, msg, ec, &rpl, sizeof(rpl));
+}
+
 static int handle_get_xsave(struct kvm_vcpu *vcpu,
const struct kvmi_msg_hdr *msg, const void *req,
vcpu_reply_fct reply_cb)
@@ -730,6 +746,7 @@ static int(*const msg_vcpu[])(struct kvm_vcpu *,
[KVMI_CONTROL_MSR]  = handle_control_msr,
[KVMI_EVENT_REPLY]  = handle_event_reply,
[KVMI_GET_CPUID]= handle_get_cpuid,
+   [KVMI_GET_MTRR_TYPE]= handle_get_mtrr_type,
[KVMI_GET_REGISTERS]= handle_get_registers,
[KVMI_GET_VCPU_INFO]= handle_get_vcpu_info,
[KVMI_GET_XSAVE]= handle_get_xsave,

[RFC PATCH v6 84/92] kvm: x86: enable the half part of movss, movsd, movups

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

A previous patch added emulation support for these instructions with a
register source and memory destination. This patch adds the variants
with a memory source and a register destination.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b42a71653622..a2e5e63bd94a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1184,6 +1184,10 @@ static u8 simd_prefix_to_bytes(const struct 
x86_emulate_ctxt *ctxt,
u8 bytes = 16;
 
switch (ctxt->b) {
+   case 0x10:
+   /* movss m32, xmm */
+   /* movsd m64, xmm */
+   /* movups m128, xmm */
case 0x11:
/* movss xmm, m32 */
/* movsd xmm, m64 */

[RFC PATCH v6 55/92] kvm: introspection: add KVMI_CONTROL_MSR and KVMI_EVENT_MSR

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

The KVMI_CONTROL_MSR is used to enable/disable introspection for a
specific MSR. The KVMI_EVENT_MSR is sent when the tracked MSR is going
to be changed. The introspection tool can respond by allowing the guest
to continue with normal execution or by discarding the change.

This is meant to prevent malicious changes to MSRs
such as MSR_IA32_SYSENTER_EIP.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  73 +
 arch/x86/include/asm/kvm_host.h|   4 +
 arch/x86/include/asm/kvmi_host.h   |   6 ++
 arch/x86/include/uapi/asm/kvmi.h   |  18 
 arch/x86/kvm/kvmi.c| 127 +
 arch/x86/kvm/svm.c |  15 
 arch/x86/kvm/vmx/vmx.c |  10 +++
 arch/x86/kvm/x86.c |  10 +++
 virt/kvm/kvmi_int.h|   8 +-
 virt/kvm/kvmi_msg.c|  13 +++
 10 files changed, 283 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 2e6e285c8e2e..c41c3edb0134 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1042,6 +1042,45 @@ ID set.
 * -KVM_EINVAL - padding is not zero
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 
+22. KVMI_CONTROL_MSR
+--------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_control_msr {
+   __u8 enable;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 msr;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Enables/disables introspection for a specific MSR and must be used
+in addition to *KVMI_CONTROL_EVENTS* with the *KVMI_EVENT_MSR* ID set.
+
+Currently, only MSRs within the following two ranges are supported. Trying
+to control events for any other register will fail with -KVM_EINVAL::
+
+   0x00000000 ... 0x00001fff
+   0xc0000000 ... 0xc0001fff
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - the specified MSR is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+
 Events
 ==
 
@@ -1308,3 +1347,37 @@ register (see **KVMI_CONTROL_EVENTS**).
 ``kvmi_event``, the control register number, the old value and the new value
 are sent to the introspector. The *CONTINUE* action will set the ``new_val``.
 
+7. KVMI_EVENT_MSR
+-----------------
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_msr {
+   __u32 msr;
+   __u32 padding;
+   __u64 old_value;
+   __u64 new_value;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+   struct kvmi_event_msr_reply {
+   __u64 new_val;
+   };
+
+This event is sent when a model specific register is going to be changed
+and the introspection has been enabled for this event and for this specific
+register (see **KVMI_CONTROL_EVENTS**).
+
+``kvmi_event``, the MSR number, the old value and the new value are
+sent to the introspector. The *CONTINUE* action will set the ``new_val``.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 22f08f2732cc..91cd43a7a7bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1013,6 +1013,8 @@ struct kvm_x86_ops {
bool (*has_emulated_msr)(int index);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
+   void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
+   bool enable);
void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
bool (*spt_fault)(struct kvm_vcpu *vcpu);
@@ -1621,6 +1623,8 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define put_smstate(type, buf, offset, val)  \
*(type *)((buf) + (offset) - 0x7e00) = val
 
+void kvm_arch_msr_intercept(struct kvm_vcpu *vcpu, unsigned int msr,
+   bool enable);
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
 bool kvm_spt_fault(struct kvm_vcpu *vcpu);
 void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable);
diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
index 83a098dc8939..8285d1eb0db6 100644
--- a/arch/x86/include/asm/kvmi_host.h
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -11,11 +11,17 @@ struct kvmi_arch_mem_access {
 
 #ifdef CONFIG_KVM_INTROSPECTION
 
+bool kvmi_msr_event(struct kvm_vcpu *vcpu, struct msr_data *msr);
 bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
   unsigned long old_value, unsigned long *new_value);
 
 #else /* CONFIG_KVM_INTROSPECTION */
 
+static inline bool kvmi_msr_event(struct kvm_vcpu *vcpu,
+ struct msr_data *msr)
+   { return true; }
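
Because the *CONTINUE* action sets ``new_val``, an introspection tool denies
a write simply by echoing the old value back. A hypothetical tool-side
fragment using the structures documented above (the 'action' field name of
kvmi_event_reply is an assumption):

   /* Deny a suspicious write to a tracked MSR: CONTINUE with
    * new_val == old_value leaves the register unchanged. */
   static void fill_msr_reply(const struct kvmi_event_msr *ev,
                              struct kvmi_event_reply *common,
                              struct kvmi_event_msr_reply *rpl,
                              int allow)
   {
           common->action = KVMI_EVENT_ACTION_CONTINUE;
           rpl->new_val = allow ? ev->new_value : ev->old_value;
   }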

[RFC PATCH v6 61/92] kvm: introspection: add KVMI_EVENT_BREAKPOINT

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This event is sent when a breakpoint is reached. It has to
be enabled with the KVMI_CONTROL_EVENTS command first.

The introspection tool can place breakpoints and use them as notification
for when the OS or an application has reached a certain state or is
trying to perform a certain operation (like creating a process).

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 36 +
 arch/x86/kvm/kvmi.c| 20 +++
 arch/x86/kvm/svm.c |  6 +++
 arch/x86/kvm/vmx/vmx.c | 17 --
 arch/x86/kvm/x86.c | 12 +
 include/linux/kvm_host.h   |  2 +
 include/linux/kvmi.h   |  7 +++
 include/uapi/linux/kvmi.h  |  6 +++
 virt/kvm/kvmi.c| 84 --
 virt/kvm/kvmi_int.h|  3 ++
 virt/kvm/kvmi_msg.c| 17 ++
 11 files changed, 201 insertions(+), 9 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 1d2431639770..da216415bf32 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1469,3 +1469,39 @@ to be changed and the introspection has been enabled for 
this event
 (see *KVMI_CONTROL_EVENTS*).
 
 ``kvmi_event`` is sent to the introspector.
+
+9. KVMI_EVENT_BREAKPOINT
+------------------------
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH, RETRY
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_breakpoint {
+   __u64 gpa;
+   __u8 insn_len;
+   __u8 padding[7];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent when a breakpoint was reached and the introspection has
+been enabled for this event (see *KVMI_CONTROL_EVENTS*).
+
+Some of these breakpoints could have been injected by the introspector,
+placed in the slack space of various functions and used as notification
+for when the OS or an application has reached a certain state or is
+trying to perform a certain operation (like creating a process).
+
+``kvmi_event`` and the guest physical address are sent to the introspector.
+
+The *RETRY* action is used by the introspector for its own breakpoints.
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 0e9c91d2f282..e998223bca1e 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -428,6 +428,26 @@ void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
kvmi_put(vcpu->kvm);
 }
 
+void kvmi_arch_breakpoint_event(struct kvm_vcpu *vcpu, u64 gva, u8 insn_len)
+{
+   u32 action;
+   u64 gpa;
+
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+   action = kvmi_msg_send_bp(vcpu, gpa, insn_len);
+   switch (action) {
+   case KVMI_EVENT_ACTION_CONTINUE:
+   kvm_arch_queue_bp(vcpu);
+   break;
+   case KVMI_EVENT_ACTION_RETRY:
+   /* rip was most likely adjusted past the INT 3 instruction */
+   break;
+   default:
+   kvmi_handle_common_event_actions(vcpu, action, "BP");
+   }
+}
+
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
u8 access)
 {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e46a4c423545..b4e59ef040b7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
 #define pr_fmt(fmt) "SVM: " fmt
 
 #include 
+#include 
 #include 
 
 #include "irq.h"
@@ -2722,6 +2723,11 @@ static int bp_interception(struct vcpu_svm *svm)
 {
struct kvm_run *kvm_run = svm->vcpu.run;
 
+   if (!kvmi_breakpoint_event(>vcpu,
+   svm->vmcb->save.cs.base + svm->vmcb->save.rip,
+   svm->vmcb->control.insn_len))
+   return 1;
+
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = BP_VECTOR;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index fff41adcdffe..d560b583bf30 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -4484,7 +4485,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
-   unsigned long cr2, rip, dr6;
+   unsigned long cr2, dr6;
u32 vect_info;
enum emulation_result er;
 
@@ -4562,7 +4563,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
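
For its own INT3 breakpoints, the documented pattern is the *RETRY* action,
after the tool has restored the original byte and rewound the instruction
pointer. A sketch with hypothetical helpers (is_our_breakpoint() and
restore_original_byte() are not part of the patch):

   /* Handle KVMI_EVENT_BREAKPOINT for a breakpoint we planted ourselves. */
   static __u32 handle_bp_event(const struct kvmi_event_breakpoint *bp)
   {
           if (is_our_breakpoint(bp->gpa)) {
                   restore_original_byte(bp->gpa);
                   /* rip must also be rewound by bp->insn_len (e.g. via a
                    * set-registers command) so the guest re-executes the
                    * original instruction */
                   return KVMI_EVENT_ACTION_RETRY;
           }
           /* not ours: let KVM reinject the #BP into the guest */
           return KVMI_EVENT_ACTION_CONTINUE;
   }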

[RFC PATCH v6 87/92] kvm: x86: emulate xorps xmm/m128, xmm

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This extends the previous xorpd by creating a dedicated group, something
I should have done from the very beginning.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 28aac552b34b..14895c043edc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1178,6 +1178,22 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
 }
 
+static int em_xorps(struct x86_emulate_ctxt *ctxt)
+{
+   const sse128_t *src = >src.vec_val;
+   sse128_t *dst = >dst.vec_val;
+   sse128_t xmm0;
+
+   asm volatile("movdqu %%xmm0, %0\n"
+"movdqu %1, %%xmm0\n"
+"xorps %2, %%xmm0\n"
+"movdqu %%xmm0, %1\n"
+"movdqu %0, %%xmm0"
+: "+m"(xmm0), "+m"(*dst) : "m"(*src));
+
+   return X86EMUL_CONTINUE;
+}
+
 static int em_xorpd(struct x86_emulate_ctxt *ctxt)
 {
const sse128_t *src = >src.vec_val;
@@ -4615,6 +4631,10 @@ static const struct gprefix pfx_0f_e7 = {
N, I(Sse, em_mov), N, N,
 };
 
+static const struct gprefix pfx_0f_57 = {
+   I(Unaligned, em_xorps), I(Unaligned, em_xorpd), N, N
+};
+
 static const struct escape escape_d9 = { {
N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
 }, {
@@ -4847,7 +4867,7 @@ static const struct opcode twobyte_table[256] = {
/* 0x40 - 0x4F */
X16(D(DstReg | SrcMem | ModRM)),
/* 0x50 - 0x5F */
-   N, N, N, N, N, N, N, I(SrcMem | DstReg | ModRM | Unaligned | Sse, em_xorpd),
+   N, N, N, N, N, N, N, GP(SrcMem | DstReg | ModRM | Sse, &pfx_0f_57),
N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
N, N, N, N,
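
For reference, a quick host-side illustration (not part of the patch) of the
two encodings the new group dispatches: 0f 57 with no SIMD prefix is xorps,
66 0f 57 is xorpd. A minimal userspace check of the m128 form, assuming an
SSE-capable x86 host:

#include <stdio.h>
#include <stdint.h>

/* xorps requires a 16-byte aligned m128 operand */
struct m128 { uint64_t q[2]; } __attribute__((aligned(16)));

int main(void)
{
	struct m128 dst = { { ~0ULL, ~0ULL } };
	struct m128 src = { { ~0ULL, 0 } };

	/* the same operation em_xorps() emulates: dst ^= src (128-bit) */
	asm volatile("movdqu %0, %%xmm0\n\t"
		     "xorps  %1, %%xmm0\n\t"
		     "movdqu %%xmm0, %0"
		     : "+m"(dst) : "m"(src) : "xmm0");

	/* prints 0000000000000000 ffffffffffffffff */
	printf("%016llx %016llx\n",
	       (unsigned long long)dst.q[0],
	       (unsigned long long)dst.q[1]);
	return 0;
}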

[RFC PATCH v6 60/92] kvm: x86: add kvm_arch_vcpu_set_guest_debug()

2019-08-09 Thread Adalbert Lazăr
This function is needed in order to intercept breakpoints and send
KVMI_EVENT_BREAKPOINT events to the introspection tool.

Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c   | 18 +-
 include/linux/kvm_host.h |  2 ++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 278a286ba262..e633f297e86d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8747,14 +8747,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return ret;
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-   struct kvm_guest_debug *dbg)
+int kvm_arch_vcpu_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
 {
unsigned long rflags;
int i, r;
 
-   vcpu_load(vcpu);
-
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
@@ -8800,10 +8798,20 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
 
 out:
-   vcpu_put(vcpu);
return r;
 }
 
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+   struct kvm_guest_debug *dbg)
+{
+   int ret;
+
+   vcpu_load(vcpu);
+   ret = kvm_arch_vcpu_set_guest_debug(vcpu, dbg);
+   vcpu_put(vcpu);
+   return ret;
+}
+
 /*
  * Translate a guest virtual address to a guest physical address.
  */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3aad3b96107b..691c24598b4d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -804,6 +804,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state);
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
+int kvm_arch_vcpu_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg);
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
  struct kvm_xsave *guest_xsave);
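
For context, a rough sketch (hypothetical here; the real call site arrives
with the breakpoint interception patches) of how introspection code running
on the vCPU thread, where the vCPU is already loaded, could use the new
variant that skips vcpu_load()/vcpu_put():

/* Hypothetical caller: enable software-breakpoint interception from
 * KVMI code that runs while the vCPU is already loaded.
 */
static int kvmi_arch_enable_sw_bp(struct kvm_vcpu *vcpu)
{
	struct kvm_guest_debug dbg = {
		.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP,
	};

	return kvm_arch_vcpu_set_guest_debug(vcpu, &dbg);
}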

[RFC PATCH v6 81/92] kvm: x86: emulate movq xmm, m64

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed in order to be able to support guest code that uses movq to
write into pages that are marked for write tracking.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8a412b8b087..2297955d0934 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1180,23 +1180,24 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
 static u8 simd_prefix_to_bytes(const struct x86_emulate_ctxt *ctxt,
   int simd_prefix)
 {
-   u8 bytes;
+   u8 bytes = 16;
 
switch (ctxt->b) {
case 0x11:
/* movss xmm, m32 */
/* movsd xmm, m64 */
/* movups xmm, m128 */
-   if (simd_prefix == 0xf3) {
+   if (simd_prefix == 0xf3)
bytes = 4;
-   break;
-   } else if (simd_prefix == 0xf2) {
+   else if (simd_prefix == 0xf2)
bytes = 8;
-   break;
-   }
-   /* fallthrough */
+   break;
+   case 0xd6:
+   /* movq xmm, m64 */
+   if (simd_prefix == 0x66)
+   bytes = 8;
+   break;
default:
-   bytes = 16;
break;
}
return bytes;
@@ -4549,6 +4550,10 @@ static const struct instr_dual instr_dual_0f_2b = {
I(0, em_mov), N
 };
 
+static const struct gprefix pfx_0f_d6 = {
+   N, I(0, em_mov), N, N,
+};
+
 static const struct gprefix pfx_0f_2b = {
ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
 };
@@ -4846,7 +4851,8 @@ static const struct opcode twobyte_table[256] = {
/* 0xC8 - 0xCF */
X8(I(DstReg, em_bswap)),
/* 0xD0 - 0xDF */
-   N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+   N, N, N, N, N, N, GP(ModRM | SrcReg | DstMem | Mov | Sse, &pfx_0f_d6),
+   N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
N, N, N, N, N, N, N, N,

[RFC PATCH v6 70/92] kvm: x86: filter out access rights only when tracked by the introspection tool

2019-08-09 Thread Adalbert Lazăr
This completes commit fd34a9518173 ("kvm: x86: consult the page
tracking from kvm_mmu_get_page() and __direct_map()").

Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 65b6acba82da..fd64cf1115da 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2660,6 +2660,9 @@ static void clear_sp_write_flooding_count(u64 *spte)
 static unsigned int kvm_mmu_page_track_acc(struct kvm_vcpu *vcpu, gfn_t gfn,
   unsigned int acc)
 {
+   if (!kvmi_tracked_gfn(vcpu, gfn))
+   return acc;
+
if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_PREREAD))
acc &= ~ACC_USER_MASK;
if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_PREWRITE) ||

[RFC PATCH v6 88/92] kvm: x86: emulate fst/fstp m64fp

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This adds support for fst m64fp and fstp m64fp.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 14895c043edc..7261b94c6c00 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1178,6 +1178,26 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
 }
 
+static int em_fstp(struct x86_emulate_ctxt *ctxt)
+{
+   if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+   return emulate_nm(ctxt);
+
+   asm volatile("fstpl %0" : "=m"(ctxt->dst.val));
+
+   return X86EMUL_CONTINUE;
+}
+
+static int em_fst(struct x86_emulate_ctxt *ctxt)
+{
+   if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+   return emulate_nm(ctxt);
+
+   asm volatile("fstl %0" : "=m"(ctxt->dst.val));
+
+   return X86EMUL_CONTINUE;
+}
+
 static int em_xorps(struct x86_emulate_ctxt *ctxt)
 {
const sse128_t *src = >src.vec_val;
@@ -4678,7 +4698,8 @@ static const struct escape escape_db = { {
 } };
 
 static const struct escape escape_dd = { {
-   N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+   N, N, I(DstMem64 | Mov, em_fst), I(DstMem64 | Mov, em_fstp),
+   N, N, N, I(DstMem16 | Mov, em_fnstsw),
 }, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
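
As a host-side illustration (not part of the patch) of the difference
between the two instructions now emulated: fstl stores ST(0) as a 64-bit
double and leaves it on the x87 stack, while fstpl also pops it:

#include <stdio.h>

int main(void)
{
	double a, b;

	/* push pi, store it twice: fstl keeps ST(0), fstpl pops it */
	asm volatile("fldpi\n\t"
		     "fstl  %0\n\t"
		     "fstpl %1"
		     : "=m"(a), "=m"(b));

	printf("%f %f\n", a, b);	/* 3.141593 3.141593 */
	return 0;
}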

[RFC PATCH v6 56/92] kvm: x86: block any attempt to disable MSR interception if tracked by introspection

2019-08-09 Thread Adalbert Lazăr
From: Nicușor Cîțu 

Intercept all calls that might disable the MSR interception (writes) and
do nothing if that specific MSR is currently tracked by the introspection
tool.

CC: Sean Christopherson 
CC: Jim Mattson 
CC: Joerg Roedel 
CC: Vitaly Kuznetsov 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvmi_host.h |  6 +++
 arch/x86/kvm/kvmi.c  | 25 +
 arch/x86/kvm/svm.c   | 33 ++---
 arch/x86/kvm/vmx/vmx.c   | 63 +++-
 4 files changed, 88 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
index 8285d1eb0db6..86d90b7bed84 100644
--- a/arch/x86/include/asm/kvmi_host.h
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -12,6 +12,7 @@ struct kvmi_arch_mem_access {
 #ifdef CONFIG_KVM_INTROSPECTION
 
 bool kvmi_msr_event(struct kvm_vcpu *vcpu, struct msr_data *msr);
+bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr);
 bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
   unsigned long old_value, unsigned long *new_value);
 
@@ -22,6 +23,11 @@ static inline bool kvmi_msr_event(struct kvm_vcpu *vcpu, struct msr_data *msr)
return true;
 }
 
+static inline bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+   return false;
+}
+
 static inline bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
 unsigned long old_value,
 unsigned long *new_value)
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 5dba4f87afef..fc6956b50da2 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -136,6 +136,31 @@ bool kvmi_msr_event(struct kvm_vcpu *vcpu, struct msr_data *msr)
return ret;
 }
 
+bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+   struct kvmi *ikvm;
+   bool ret = false;
+
+   if (!vcpu)
+   return false;
+
+   ikvm = kvmi_get(vcpu->kvm);
+   if (!ikvm)
+   return false;
+
+   if (test_msr_mask(vcpu, msr)) {
+   kvmi_warn_once(ikvm,
+  "Trying to disable write interception for MSR 
%x\n",
+  msr);
+   ret = true;
+   }
+
+   kvmi_put(vcpu->kvm);
+
+   return ret;
+}
+EXPORT_SYMBOL(kvmi_monitored_msr);
+
 static void *alloc_get_registers_reply(const struct kvmi_msg_hdr *msg,
   const struct kvmi_get_registers *req,
   size_t *rpl_size)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cdb315578979..e46a4c423545 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
 #define pr_fmt(fmt) "SVM: " fmt
 
 #include 
+#include 
 
 #include "irq.h"
 #include "mmu.h"
@@ -1049,13 +1050,19 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
return !!test_bit(bit_write, &tmp);
 }
 
-static void set_msr_interception(u32 *msrpm, unsigned msr,
+static void set_msr_interception(struct vcpu_svm *svm,
+u32 *msrpm, unsigned int msr,
 int read, int write)
 {
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
 
+#ifdef CONFIG_KVM_INTROSPECTION
+   if (!write && kvmi_monitored_msr(>vcpu, msr))
+   return;
+#endif /* CONFIG_KVM_INTROSPECTION */
+
/*
 * If this warning triggers extend the direct_access_msrs list at the
 * beginning of the file
@@ -1085,7 +1092,7 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
if (!direct_access_msrs[i].always)
continue;
 
-   set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+   set_msr_interception(NULL, msrpm, direct_access_msrs[i].index, 1, 1);
}
 }
 
@@ -1137,10 +1144,10 @@ static void svm_enable_lbrv(struct vcpu_svm *svm)
u32 *msrpm = svm->msrpm;
 
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
-   set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
-   set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
-   set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
-   set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+   set_msr_interception(svm, msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+   set_msr_interception(svm, msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+   set_msr_interception(svm, msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+   set_msr_interception(svm, msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
 }
 
 static void svm_disable_lbrv(struct vcpu_svm *svm)
@@ -1148,10 +1155,10 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
u32 *msrpm = svm->msrpm;
 
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
-   set_msr_intercept

[RFC PATCH v6 46/92] kvm: introspection: add KVMI_SET_PAGE_WRITE_BITMAP

2019-08-09 Thread Adalbert Lazăr
This command sets the subpage protection (SPP) write bitmap for an array
of 4KB guest physical pages.

Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 66 ++
 arch/x86/kvm/kvmi.c| 30 ++
 include/uapi/linux/kvmi.h  | 13 ++
 virt/kvm/kvmi.c| 37 +
 virt/kvm/kvmi_int.h|  4 ++
 virt/kvm/kvmi_msg.c| 13 ++
 6 files changed, 163 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 2ffb92b0fa71..69557c63ff94 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -694,6 +694,72 @@ EPT view (0 is primary). On all other hardware it must be zero.
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOMEM - not enough memory to allocate the reply
 
+13. KVMI_SET_PAGE_WRITE_BITMAP
+--
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_set_page_write_bitmap {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   struct kvmi_page_write_bitmap_entry entries[0];
+   };
+
+where::
+
+   struct kvmi_page_write_bitmap_entry {
+   __u64 gpa;
+   __u32 bitmap;
+   __u32 padding;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+
+Sets the subpage protection (SPP) write bitmap for an array of ``count``
+4KB guest physical pages.
+
+The command will make the changes starting with the first entry and
+it will stop on the first error. The introspection tool should handle
+the rollback.
+
+While the *KVMI_SET_PAGE_ACCESS* command can be used to write-protect a
+4KB page, this command can write-protect 128-bytes subpages inside of a
+4KB page by setting the corresponding bit to 1 (write allowed) or to 0
+(write disallowed). For example, to allow write access to the A and B
+subpages only, the bitmap must be set to::
+
+   BIT(A) | BIT(B)
+
+A and B must each be a number between 0 (the first subpage) and 31 (the last).
+
+Using this command to set all bits to 1 (allow write access for
+all subpages) will allow write access to the whole 4KB page (like a
+*KVMI_SET_PAGE_ACCESS* command with the *KVMI_PAGE_ACCESS_W* flag set)
+and vice versa.
+
+Using this command to set any bit to 0 will write-protect the whole 4KB
+page (like a *KVMI_SET_PAGE_ACCESS* command with the *KVMI_PAGE_ACCESS_W*
+flag cleared) and allow write access only for subpages with the
+corresponding bit set to 1.
+
+:Errors:
+
+* -KVM_EINVAL - the selected SPT view is invalid
+* -KVM_EOPNOTSUPP - a SPT view was selected but the hardware doesn't support it
+* -KVM_EOPNOTSUPP - the hardware doesn't support SPP or hasn't been enabled
+* -KVM_EINVAL - the write access is already allowed for the whole 4KB page
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOMEM - not enough memory to add the page tracking structures
+
 Events
 ==
 
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 356ec79936b3..fa290fbf1f75 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -304,6 +304,36 @@ int kvmi_arch_cmd_set_page_access(struct kvmi *ikvm,
return ec;
 }
 
+int kvmi_arch_cmd_set_page_write_bitmap(struct kvmi *ikvm,
+   const struct kvmi_msg_hdr *msg,
+   const struct kvmi_set_page_write_bitmap
+   *req)
+{
+   u16 k, n = req->count;
+   int ec = 0;
+
+   if (req->padding)
+   return -KVM_EINVAL;
+
+   if (msg->size < sizeof(*req) + req->count * sizeof(req->entries[0]))
+   return -KVM_EINVAL;
+
+   if (!kvmi_spp_enabled(ikvm))
+   return -KVM_EOPNOTSUPP;
+
+   if (req->view != 0) /* TODO */
+   return -KVM_EOPNOTSUPP;
+
+   for (k = 0; k < n && ec == 0; k++) {
+   u64 gpa = req->entries[k].gpa;
+   u32 bitmap = req->entries[k].bitmap;
+
+   ec = kvmi_cmd_set_page_write_bitmap(ikvm, gpa, bitmap);
+   }
+
+   return ec;
+}
+
 int kvmi_arch_cmd_control_spp(struct kvmi *ikvm)
 {
return kvm_arch_init_spp(ikvm->kvm);
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 19a6a50df96b..0b3139c52a30 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -160,6 +160,19 @@ struct kvmi_get_page_write_bitmap_reply {
__u32 bitmap[0];
 };
 
+struct kvmi_page_write_bitmap_entry {
+   __u64 gpa;
+   __u32 bitmap;
+   __u32 padding;
+};
+
+struct kvmi_set_page_write_bitmap {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   struct kvmi_page_write_bitmap_entry entries[0];
+};
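
To make the wire format concrete, here is a minimal tool-side sketch (not
part of the patch; the structures simply mirror the UAPI definitions above,
with one fixed entry instead of the flexible array) that builds a request
allowing writes only to the first two 128-byte subpages of one 4KB page:

#include <stdint.h>
#include <string.h>

struct kvmi_page_write_bitmap_entry {
	uint64_t gpa;
	uint32_t bitmap;
	uint32_t padding;
};

struct kvmi_set_page_write_bitmap {
	uint16_t view;
	uint16_t count;
	uint32_t padding;
	struct kvmi_page_write_bitmap_entry entries[1];
};

static size_t build_spp_request(struct kvmi_set_page_write_bitmap *req,
				uint64_t gpa)
{
	memset(req, 0, sizeof(*req));
	req->view = 0;		/* only the primary EPT view for now */
	req->count = 1;
	req->entries[0].gpa = gpa;
	/* allow writes to subpages 0 and 1, write-protect the other 30 */
	req->entries[0].bitmap = (1u << 0) | (1u << 1);

	return sizeof(*req);
}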

[RFC PATCH v6 65/92] kvm: introspection: add KVMI_EVENT_SINGLESTEP

2019-08-09 Thread Adalbert Lazăr
From: Nicușor Cîțu 

This event is sent when the current instruction has been single-stepped
as a result of a KVMI_EVENT_PF event to which the introspection tool
set the singlestep field and responded with CONTINUE.

Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 25 +++
 virt/kvm/kvmi.c| 40 ++
 2 files changed, 65 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 8721a470de87..572abab1f6ef 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1574,3 +1574,28 @@ introspection has been enabled for this event (see **KVMI_CONTROL_EVENTS**).
KVMI_DESC_TR
 
 ``write`` is 1 if the descriptor was written, 0 otherwise.
+
+12. KVMI_EVENT_SINGLESTEP
+-
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent when the current instruction has been executed
+(as a result of a *KVMI_EVENT_PF* event to which the introspection
+tool set the ``singlestep`` field and responded with *CONTINUE*)
+and the introspection has been enabled for this event
+(see **KVMI_CONTROL_EVENTS**).
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index a3a5af9080a9..3dfedf3ae739 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -1182,6 +1182,44 @@ void kvmi_trap_event(struct kvm_vcpu *vcpu)
kvmi_put(vcpu->kvm);
 }
 
+static u32 kvmi_send_singlestep(struct kvm_vcpu *vcpu)
+{
+   int err, action;
+
+   err = kvmi_send_event(vcpu, KVMI_EVENT_SINGLESTEP, NULL, 0,
+ NULL, 0, &action);
+   if (err)
+   return KVMI_EVENT_ACTION_CONTINUE;
+
+   return action;
+}
+
+static void __kvmi_singlestep_event(struct kvm_vcpu *vcpu)
+{
+   u32 action;
+
+   action = kvmi_send_singlestep(vcpu);
+   switch (action) {
+   case KVMI_EVENT_ACTION_CONTINUE:
+   break;
+   default:
+   kvmi_handle_common_event_actions(vcpu, action, "SINGLESTEP");
+   }
+}
+
+static void kvmi_singlestep_event(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   if (!ivcpu->ss_requested)
+   return;
+
+   if (is_event_enabled(vcpu, KVMI_EVENT_SINGLESTEP))
+   __kvmi_singlestep_event(vcpu);
+
+   ivcpu->ss_requested = false;
+}
+
 static bool __kvmi_create_vcpu_event(struct kvm_vcpu *vcpu)
 {
u32 action;
@@ -1616,6 +1654,8 @@ void kvmi_stop_ss(struct kvm_vcpu *vcpu)
 
ivcpu->ss_owner = false;
 
+   kvmi_singlestep_event(vcpu);
+
 out:
kvmi_put(kvm);
 }

[RFC PATCH v6 41/92] KVM: MMU: Enable Lazy mode SPPT setup

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

If SPP subpages are set while the physical page is not yet
available in the EPT leaf entry, the mapping is first stored
in the SPP access bitmap buffer. SPPT setup is deferred until
the protected page is accessed; the SPPT entries are then set
up in the EPT page fault handler.

Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-9-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/mmu.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d59108a3ebbf..24222e3add91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4400,6 +4400,26 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
 }
 
+static int kvm_enable_spp_protection(struct kvm *kvm, u64 gfn)
+{
+   struct kvm_subpage spp_info = {0};
+   struct kvm_memory_slot *slot;
+
+   slot = gfn_to_memslot(kvm, gfn);
+   if (!slot)
+   return -EFAULT;
+
+   spp_info.base_gfn = gfn;
+   spp_info.npages = 1;
+
+   if (kvm_mmu_get_subpages(kvm, &spp_info, true) < 0)
+   return -EFAULT;
+
+   if (spp_info.access_map[0] != FULL_SPP_ACCESS)
+   kvm_mmu_set_subpages(kvm, &spp_info, true);
+
+   return 0;
+}
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
  bool prefault)
 {
@@ -4451,6 +4471,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, , , );
r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
+
+   if (vcpu->kvm->arch.spp_active && level == PT_PAGE_TABLE_LEVEL)
+   kvm_enable_spp_protection(vcpu->kvm, gfn);
+
spin_unlock(>kvm->mmu_lock);
 
return r;

[RFC PATCH v6 63/92] kvm: introspection: add KVMI_EVENT_DESCRIPTOR

2019-08-09 Thread Adalbert Lazăr
From: Nicușor Cîțu 

This event is sent when IDTR, GDTR, LDTR or TR are accessed.

These could be used to implement a tiny agent which runs in the context
of an introspected guest and uses virtualized exceptions (#VE) and
alternate EPT views (VMFUNC #0) to filter converted VMEXITS. The events
of interested will be suppressed (after some appropriate guest-side
handling) while the rest will be sent to the introspector via a VMCALL.

Signed-off-by: Nicușor Cîțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 38 +++
 arch/x86/include/asm/kvm_host.h|  1 +
 arch/x86/include/uapi/asm/kvmi.h   | 11 +
 arch/x86/kvm/kvmi.c| 70 
 arch/x86/kvm/svm.c | 74 ++
 arch/x86/kvm/vmx/vmx.c | 59 +++-
 arch/x86/kvm/vmx/vmx.h |  2 +
 arch/x86/kvm/x86.c |  6 +++
 include/linux/kvm_host.h   |  1 +
 include/linux/kvmi.h   |  4 ++
 virt/kvm/kvmi.c|  2 +-
 virt/kvm/kvmi_int.h|  3 ++
 virt/kvm/kvmi_msg.c| 17 +++
 13 files changed, 285 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 2603813d1ee6..8721a470de87 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1536,3 +1536,41 @@ It is used by the code residing inside the introspected guest to call the
 introspection tool and to report certain details about its operation. For
 example, a classic antimalware remediation tool can report what it has
 found during a scan.
+
+11. KVMI_EVENT_DESCRIPTOR
+-
+
+:Architecture: x86
+:Versions: >= 1
+:Actions: CONTINUE, RETRY, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_descriptor {
+   __u8 descriptor;
+   __u8 write;
+   __u8 padding[6];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent when a descriptor table register is accessed and the
+introspection has been enabled for this event (see **KVMI_CONTROL_EVENTS**).
+
+``kvmi_event`` and ``kvmi_event_descriptor`` are sent to the introspector.
+
+``descriptor`` can be one of::
+
+   KVMI_DESC_IDTR
+   KVMI_DESC_GDTR
+   KVMI_DESC_LDTR
+   KVMI_DESC_TR
+
+``write`` is 1 if the descriptor was written, 0 otherwise.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 91cd43a7a7bf..ad36a5fc2048 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1015,6 +1015,7 @@ struct kvm_x86_ops {
 
void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
bool enable);
+   bool (*desc_intercept)(struct kvm_vcpu *vcpu, bool enable);
void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
bool (*spt_fault)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index c3c96e6e2a26..0fa4ac3ed5d1 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -110,4 +110,15 @@ struct kvmi_get_mtrr_type_reply {
__u8 padding[7];
 };
 
+#define KVMI_DESC_IDTR 1
+#define KVMI_DESC_GDTR 2
+#define KVMI_DESC_LDTR 3
+#define KVMI_DESC_TR   4
+
+struct kvmi_event_descriptor {
+   __u8 descriptor;
+   __u8 write;
+   __u8 padding[6];
+};
+
 #endif /* _UAPI_ASM_X86_KVMI_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 02e026ef5ed7..04cac5b8a4d0 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -161,6 +161,38 @@ bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr)
 }
 EXPORT_SYMBOL(kvmi_monitored_msr);
 
+static int kvmi_control_event_desc(struct kvm_vcpu *vcpu, bool enable)
+{
+   int err = 0;
+
+   if (enable) {
+   if (!is_event_enabled(vcpu, KVMI_EVENT_DESCRIPTOR))
+   if (!kvm_arch_vcpu_intercept_desc(vcpu, true))
+   err = -KVM_EOPNOTSUPP;
+   } else if (is_event_enabled(vcpu, KVMI_EVENT_DESCRIPTOR)) {
+   kvm_arch_vcpu_intercept_desc(vcpu, false);
+   }
+
+   return err;
+}
+
+int kvmi_arch_cmd_control_event(struct kvm_vcpu *vcpu, unsigned int event_id,
+   bool enable)
+{
+   int err;
+
+   switch (event_id) {
+   case KVMI_EVENT_DESCRIPTOR:
+   err = kvmi_control_event_desc(vcpu, enable);
+   break;
+   default:
+   err = 0;
+   break;
+   }
+
+   return err;
+}
+
 static void *alloc_get_registers_reply(const struct kvmi_msg_hdr *msg,
   const struct kvmi_get_registers *req,

[RFC PATCH v6 19/92] kvm: introspection: add KVMI_EVENT_CREATE_VCPU

2019-08-09 Thread Adalbert Lazăr
From: Mircea Cîrjaliu 

This event is sent when a vCPU is ready to be introspected.

Signed-off-by: Mircea Cîrjaliu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 23 +++
 virt/kvm/kvmi.c| 47 ++
 virt/kvm/kvmi_int.h|  1 +
 virt/kvm/kvmi_msg.c| 12 
 4 files changed, 83 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 28e1a1c80551..b29cd1b80b4f 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -513,3 +513,26 @@ pause/stop/migrate the guest (see **Unhooking**) and the introspection
 has been enabled for this event (see **KVMI_CONTROL_VM_EVENTS**).
 The introspection tool has a chance to unhook and close the KVMI channel
 (signaling that the operation can proceed).
+
+2. KVMI_EVENT_CREATE_VCPU
+-
+
+:Architectures: all
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent when a new vCPU is created and the introspection has
+been enabled for this event (see *KVMI_CONTROL_VM_EVENTS*).
+
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 7eda49bf65c4..d0d9adf5b6ed 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -13,6 +13,7 @@
 static struct kmem_cache *msg_cache;
 static struct kmem_cache *job_cache;
 
+static bool kvmi_create_vcpu_event(struct kvm_vcpu *vcpu);
 static void kvmi_abort_events(struct kvm *kvm);
 
 void *kvmi_msg_alloc(void)
@@ -150,6 +151,11 @@ static struct kvmi_job *kvmi_pull_job(struct kvmi_vcpu *ivcpu)
return job;
 }
 
+static void kvmi_job_create_vcpu(struct kvm_vcpu *vcpu, void *ctx)
+{
+   kvmi_create_vcpu_event(vcpu);
+}
+
 static bool alloc_ivcpu(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu;
@@ -245,6 +251,9 @@ int kvmi_vcpu_init(struct kvm_vcpu *vcpu)
goto out;
}
 
+   if (kvmi_add_job(vcpu, kvmi_job_create_vcpu, NULL, NULL))
+   ret = -ENOMEM;
+
 out:
kvmi_put(vcpu->kvm);
 
@@ -330,6 +339,10 @@ int kvmi_hook(struct kvm *kvm, const struct kvm_introspection *qemu)
err = -ENOMEM;
goto err_alloc;
}
+   if (kvmi_add_job(vcpu, kvmi_job_create_vcpu, NULL, NULL)) {
+   err = -ENOMEM;
+   goto err_alloc;
+   }
}
 
/* interact with other kernel components after structure allocation */
@@ -551,6 +564,40 @@ void kvmi_handle_common_event_actions(struct kvm_vcpu *vcpu, u32 action,
}
 }
 
+static bool __kvmi_create_vcpu_event(struct kvm_vcpu *vcpu)
+{
+   u32 action;
+   bool ret = false;
+
+   action = kvmi_msg_send_create_vcpu(vcpu);
+   switch (action) {
+   case KVMI_EVENT_ACTION_CONTINUE:
+   ret = true;
+   break;
+   default:
+   kvmi_handle_common_event_actions(vcpu, action, "CREATE");
+   }
+
+   return ret;
+}
+
+static bool kvmi_create_vcpu_event(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm;
+   bool ret = true;
+
+   ikvm = kvmi_get(vcpu->kvm);
+   if (!ikvm)
+   return true;
+
+   if (test_bit(KVMI_EVENT_CREATE_VCPU, ikvm->vm_ev_mask))
+   ret = __kvmi_create_vcpu_event(vcpu);
+
+   kvmi_put(vcpu->kvm);
+
+   return ret;
+}
+
 void kvmi_run_jobs(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 9750a9b9902b..c21f0fd5e16c 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -123,6 +123,7 @@ bool kvmi_sock_get(struct kvmi *ikvm, int fd);
 void kvmi_sock_shutdown(struct kvmi *ikvm);
 void kvmi_sock_put(struct kvmi *ikvm);
 bool kvmi_msg_process(struct kvmi *ikvm);
+u32 kvmi_msg_send_create_vcpu(struct kvm_vcpu *vcpu);
 int kvmi_msg_send_unhook(struct kvmi *ikvm);
 
 /* kvmi.c */
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index 0c7c1e968007..8e8af572a4f4 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -725,3 +725,15 @@ int kvmi_msg_send_unhook(struct kvmi *ikvm)
 
return kvmi_sock_write(ikvm, vec, n, msg_size);
 }
+
+u32 kvmi_msg_send_create_vcpu(struct kvm_vcpu *vcpu)
+{
+   int err, action;
+
+   err = kvmi_send_event(vcpu, KVMI_EVENT_CREATE_VCPU, NULL, 0,
+ NULL, 0, &action);
+   if (err)
+   return KVMI_EVENT_ACTION_CONTINUE;
+
+   return action;
+}

[RFC PATCH v6 26/92] kvm: x86: add kvm_mmu_nested_pagefault()

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed to filter #PF introspection events.

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h | 4 
 arch/x86/kvm/mmu.c  | 5 +
 arch/x86/kvm/svm.c  | 7 +++
 arch/x86/kvm/vmx/vmx.c  | 9 +
 4 files changed, 25 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2d6bde6fa59f..7da1137a2b82 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1004,6 +1004,8 @@ struct kvm_x86_ops {
bool (*has_emulated_msr)(int index);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
+   bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
+
struct kvm *(*vm_alloc)(void);
void (*vm_free)(struct kvm *);
int (*vm_init)(struct kvm *kvm);
@@ -1593,4 +1595,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 #define put_smstate(type, buf, offset, val)  \
*(type *)((buf) + (offset) - 0x7e00) = val
 
+bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff053f17b8c2..9eaf6cc776a9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -6169,3 +6169,8 @@ void kvm_mmu_module_exit(void)
unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
 }
+
+bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu)
+{
+   return kvm_x86_ops->nested_pagefault(vcpu);
+}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f13a3a24d360..3c099c56099c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7098,6 +7098,11 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
 }
 
+static bool svm_nested_pagefault(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7109,6 +7114,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.has_emulated_msr = svm_has_emulated_msr,
 
+   .nested_pagefault = svm_nested_pagefault,
+
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
.vcpu_reset = svm_vcpu_reset,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 30a6bcd735ec..e10ee8fd1c67 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7682,6 +7682,13 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
 }
 
+static bool vmx_nested_pagefault(struct kvm_vcpu *vcpu)
+{
+   if (vcpu->arch.exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)
+   return false;
+   return true;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -7693,6 +7700,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_accelerated_tpr = report_flexpriority,
.has_emulated_msr = vmx_has_emulated_msr,
 
+   .nested_pagefault = vmx_nested_pagefault,
+
.vm_init = vmx_vm_init,
.vm_alloc = vmx_vm_alloc,
.vm_free = vmx_vm_free,

[RFC PATCH v6 25/92] kvm: x86: intercept the write access on sidt and other emulated instructions

2019-08-09 Thread Adalbert Lazăr
This is needed for the introspection subsystem to track the changes to
descriptor table registers.

CC: Joerg Roedel 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7aef002be551..c28e2a20dec2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5185,11 +5185,14 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
 
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
+   if (!kvm_page_track_prewrite(vcpu, gpa, addr, data, towrite))
+   return X86EMUL_RETRY_INSTR;
ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
}
+   kvm_page_track_write(vcpu, gpa, addr, data, towrite);
 
bytes -= towrite;
data += towrite;

[RFC PATCH v6 69/92] kvm: x86: keep the page protected if tracked by the introspection tool

2019-08-09 Thread Adalbert Lazăr
This patch might be obsolete thanks to single-stepping.

Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c06de73a784..06f44ce8ed07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6311,7 +6311,8 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
spin_unlock(&vcpu->kvm->mmu_lock);
 
-   if (indirect_shadow_pages)
+   if (indirect_shadow_pages
+   && !kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
return true;
@@ -6322,7 +6323,8 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
 * and it failed try to unshadow page and re-enter the
 * guest to let CPU execute the instruction.
 */
-   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+   if (!kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
+   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
/*
 * If the access faults on its page table, it can not
@@ -6374,6 +6376,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!vcpu->arch.mmu->direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
+   if (kvmi_tracked_gfn(vcpu, gpa_to_gfn(gpa)))
+   return false;
+
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
return true;

[RFC PATCH v6 91/92] kvm: x86: emulate lock cmpxchg16b m128

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This patch adds support for lock cmpxchg16b m128 by extending the
existing emulation for lock cmpxchg8b m64.

For implementing the atomic operation, we use an explicit assembler
statement, as cmpxchg_double() does not provide the contents of the
memory on failure. As before, writeback is completely disabled as the
operation is executed directly on guest memory, unless the architecture
does not advertise CMPXCHG16B in CPUID.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 117 ++---
 arch/x86/kvm/x86.c |  37 -
 2 files changed, 122 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2038e42c1eae..a37ad63836ea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2318,46 +2318,103 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
return rc;
 }
 
-static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+static int em_cmpxchg8b_locked(struct x86_emulate_ctxt *ctxt)
 {
-   u64 old;
+   int rc;
+   ulong linear;
+   u64 new = (reg_read(ctxt, VCPU_REGS_RBX) & (u32)-1) |
+   ((reg_read(ctxt, VCPU_REGS_RCX) & (u32)-1) << 32);
+   u64 old = (reg_read(ctxt, VCPU_REGS_RAX) & (u32)-1) |
+   ((reg_read(ctxt, VCPU_REGS_RDX) & (u32)-1) << 32);
 
-   if (ctxt->lock_prefix) {
-   int rc;
-   ulong linear;
-   u64 new = (reg_read(ctxt, VCPU_REGS_RBX) & (u32)-1) |
-   ((reg_read(ctxt, VCPU_REGS_RCX) & (u32)-1) << 32);
+   /* disable writeback altogether */
+   ctxt->d |= NoWrite;
 
-   old = (reg_read(ctxt, VCPU_REGS_RAX) & (u32)-1) |
-   ((reg_read(ctxt, VCPU_REGS_RDX) & (u32)-1) << 32);
+   rc = linearize(ctxt, ctxt->dst.addr.mem, 8, true, );
+   if (rc != X86EMUL_CONTINUE)
+   return rc;
 
-   /* disable writeback altogether */
-   ctxt->d &= ~SrcWrite;
-   ctxt->d |= NoWrite;
+   rc = ctxt->ops->cmpxchg_emulated(ctxt, linear, &old, &new,
+                                    8, &ctxt->exception);
 
-   rc = linearize(ctxt, ctxt->dst.addr.mem, 8, true, );
-   if (rc != X86EMUL_CONTINUE)
-   return rc;
 
-   rc = ctxt->ops->cmpxchg_emulated(ctxt, linear, &old, &new,
-                                    ctxt->dst.bytes,
-                                    &ctxt->exception);
+   switch (rc) {
+   case X86EMUL_CONTINUE:
+   ctxt->eflags |= X86_EFLAGS_ZF;
+   break;
+   case X86EMUL_CMPXCHG_FAILED:
+   *reg_write(ctxt, VCPU_REGS_RAX) = old & (u32)-1;
+   *reg_write(ctxt, VCPU_REGS_RDX) = (old >> 32) & (u32)-1;
 
-   switch (rc) {
-   case X86EMUL_CONTINUE:
-   ctxt->eflags |= X86_EFLAGS_ZF;
-   break;
-   case X86EMUL_CMPXCHG_FAILED:
-   *reg_write(ctxt, VCPU_REGS_RAX) = old & (u32)-1;
-   *reg_write(ctxt, VCPU_REGS_RDX) = (old >> 32) & (u32)-1;
+   ctxt->eflags &= ~X86_EFLAGS_ZF;
 
-   ctxt->eflags &= ~X86_EFLAGS_ZF;
+   rc = X86EMUL_CONTINUE;
+   break;
+   }
 
-   rc = X86EMUL_CONTINUE;
-   break;
-   }
+   return rc;
+}
+
+#ifdef CONFIG_X86_64
+static int em_cmpxchg16b_locked(struct x86_emulate_ctxt *ctxt)
+{
+   int rc;
+   ulong linear;
+   u64 new[2] = {
+   reg_read(ctxt, VCPU_REGS_RBX),
+   reg_read(ctxt, VCPU_REGS_RCX)
+   };
+   u64 old[2] = {
+   reg_read(ctxt, VCPU_REGS_RAX),
+   reg_read(ctxt, VCPU_REGS_RDX)
+   };
 
+   /* disable writeback altogether */
+   ctxt->d |= NoWrite;
+
+   rc = linearize(ctxt, ctxt->dst.addr.mem, 16, true, );
+   if (rc != X86EMUL_CONTINUE)
return rc;
+
+   if (linear % 16)
+   return emulate_gp(ctxt, 0);
+
+   rc = ctxt->ops->cmpxchg_emulated(ctxt, linear, old, new,
+                                    16, &ctxt->exception);
+
+   switch (rc) {
+   case X86EMUL_CONTINUE:
+   ctxt->eflags |= X86_EFLAGS_ZF;
+   break;
+   case X86EMUL_CMPXCHG_FAILED:
+   *reg_write(ctxt, VCPU_REGS_RAX) = old[0];
+   *reg_write(ctxt, VCPU_REGS_RDX) = old[1];
+
+   ctxt->eflags &= ~X86_EFLAGS_ZF;
+
+   rc = X86EMUL_CONTINUE;
+   break;
+   }
+
+   return rc;
+}
+#else
+static int em_cmpxchg16b_locked(struct x86_emulate_ctxt *ctxt)
+{
+   return X86EMUL_UNHANDLEABLE;
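
As an aside, the "linear % 16" check above mirrors the hardware behaviour:
cmpxchg16b raises #GP on a misaligned operand. A host-side illustration (not
part of the patch; build with gcc -mcx16 so the builtin compiles to lock
cmpxchg16b):

#include <stdio.h>

int main(void)
{
	/* cmpxchg16b requires a 16-byte aligned memory operand */
	__int128 val __attribute__((aligned(16))) = 1;
	__int128 expected = 1, desired = 2;

	/* emits lock cmpxchg16b when built with gcc -mcx16 */
	int ok = __sync_bool_compare_and_swap(&val, expected, desired);

	printf("swapped=%d\n", ok);	/* swapped=1 */
	return 0;
}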

[RFC PATCH v6 13/92] kvm: introspection: make the vCPU wait even when its jobs list is empty

2019-08-09 Thread Adalbert Lazăr
Usually, the vCPU thread will run the functions from its jobs list
(unless the thread is SIGKILL-ed) and continue to guest when the
list is empty. But, there are cases when it has to wait for something
(e.g. another vCPU runs in single-step mode, or the current vCPU waits
for an event reply from the introspection tool).

In these cases, it will append a "wait job" into its own list, which
will do (a) nothing if the list is not empty or it doesn't have to wait
any longer or (b) wait (in the same wake-queue used by KVM) until it
is kicked. It should be OK if the receiving worker appends a new job at
the same time.

Signed-off-by: Adalbert Lazăr 
---
 include/linux/swait.h | 11 ++
 virt/kvm/kvmi.c   | 80 +++
 virt/kvm/kvmi_int.h   |  2 ++
 3 files changed, 93 insertions(+)

diff --git a/include/linux/swait.h b/include/linux/swait.h
index 73e06e9986d4..2486625e7fb4 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -297,4 +297,15 @@ do {   
\
__ret;  \
 })
 
+#define __swait_event_killable(wq, condition)  \
+   ___swait_event(wq, condition, TASK_KILLABLE, 0, schedule()) \
+
+#define swait_event_killable(wq, condition)\
+({ \
+   int __ret = 0;  \
+   if (!(condition))   \
+   __ret = __swait_event_killable(wq, condition);  \
+   __ret;  \
+})
+
 #endif /* _LINUX_SWAIT_H */
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 07ebd1c629b0..3c884dc0e38c 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -135,6 +135,19 @@ static void kvmi_free_job(struct kvmi_job *job)
kmem_cache_free(job_cache, job);
 }
 
+static struct kvmi_job *kvmi_pull_job(struct kvmi_vcpu *ivcpu)
+{
+   struct kvmi_job *job = NULL;
+
+   spin_lock(&ivcpu->job_lock);
+   job = list_first_entry_or_null(&ivcpu->job_list, typeof(*job), link);
+   if (job)
+   list_del(&job->link);
+   spin_unlock(&ivcpu->job_lock);
+
+   return job;
+}
+
 static bool alloc_ivcpu(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu;
@@ -496,6 +509,73 @@ void kvmi_destroy_vm(struct kvm *kvm)
wait_for_completion_killable(&kvm->kvmi_completed);
 }
 
+void kvmi_run_jobs(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   struct kvmi_job *job;
+
+   while ((job = kvmi_pull_job(ivcpu))) {
+   job->fct(vcpu, job->ctx);
+   kvmi_free_job(job);
+   }
+}
+
+static bool done_waiting(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   return !list_empty(&ivcpu->job_list);
+}
+
+static void kvmi_job_wait(struct kvm_vcpu *vcpu, void *ctx)
+{
+   struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   int err;
+
+   err = swait_event_killable(*wq, done_waiting(vcpu));
+
+   if (err)
+   ivcpu->killed = true;
+}
+
+int kvmi_run_jobs_and_wait(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   int err = 0;
+
+   for (;;) {
+   kvmi_run_jobs(vcpu);
+
+   if (ivcpu->killed) {
+   err = -1;
+   break;
+   }
+
+   kvmi_add_job(vcpu, kvmi_job_wait, NULL, NULL);
+   }
+
+   return err;
+}
+
+void kvmi_handle_requests(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm;
+
+   ikvm = kvmi_get(vcpu->kvm);
+   if (!ikvm)
+   return;
+
+   for (;;) {
+   int err = kvmi_run_jobs_and_wait(vcpu);
+
+   if (err)
+   break;
+   }
+
+   kvmi_put(vcpu->kvm);
+}
+
 int kvmi_cmd_control_vm_events(struct kvmi *ikvm, unsigned int event_id,
   bool enable)
 {
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 97f91a568096..47418e9a86f6 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -85,6 +85,8 @@ struct kvmi_job {
 struct kvmi_vcpu {
struct list_head job_list;
spinlock_t job_lock;
+
+   bool killed;
 };
 
 #define IKVM(kvm) ((struct kvmi *)((kvm)->kvmi))
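
To illustrate the job API this patch builds on (the job below is
hypothetical, not one from the series): a receiving worker queues work for
the vCPU thread with kvmi_add_job(), and the wait-job mechanism described
above ensures the vCPU thread wakes up to run it:

/* Hypothetical example of queueing work for a vCPU thread. */
static void my_log_job(struct kvm_vcpu *vcpu, void *ctx)
{
	/* runs on the vCPU thread, from kvmi_run_jobs() */
	pr_info("kvmi: vcpu %d processed a job\n", vcpu->vcpu_id);
}

static int queue_log_job(struct kvm_vcpu *vcpu)
{
	/* ctx and the ctx-free callback are unused for this job */
	return kvmi_add_job(vcpu, my_log_job, NULL, NULL);
}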

[RFC PATCH v6 42/92] KVM: MMU: Handle host memory remapping and reclaim

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

Host page swapping/migration may change the translation in
an EPT leaf entry. If the target page is SPP-protected,
re-enable SPP protection for it in the MMU notifier. If an
SPPT shadow page is reclaimed, the level 1 pages don't have
an rmap to clear.

Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-10-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/mmu.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24222e3add91..0b859b1797f6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2004,6 +2004,24 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
 
+   /*
+* if it's EPT leaf entry and the physical page is
+* SPP protected, then re-enable SPP protection for
+* the page.
+*/
+   if (kvm->arch.spp_active &&
+   level == PT_PAGE_TABLE_LEVEL) {
+   struct kvm_subpage spp_info = {0};
+   int i;
+
+   spp_info.base_gfn = gfn;
+   spp_info.npages = 1;
+   i = kvm_mmu_get_subpages(kvm, &spp_info, true);
+   if (i == 1 &&
+   spp_info.access_map[0] != FULL_SPP_ACCESS)
+   new_spte |= PT_SPP_MASK;
+   }
+
new_spte = mark_spte_for_access_track(new_spte);
 
mmu_spte_clear_track_bits(sptep);
@@ -2905,6 +2923,10 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
+   /* SPPT leaf entries don't have rmaps */
+   if (sp->role.level == PT_PAGE_TABLE_LEVEL &&
+   is_spp_spte(sp))
+   return true;
drop_spte(kvm, spte);
if (is_large_pte(pte))
--kvm->stat.lpages;

[RFC PATCH v6 47/92] kvm: introspection: add KVMI_READ_PHYSICAL and KVMI_WRITE_PHYSICAL

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

These commands allow the introspection tool to read from and write to
the guest memory.

Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  60 
 include/uapi/linux/kvmi.h  |  11 +++
 virt/kvm/kvmi.c| 107 +
 virt/kvm/kvmi_int.h|   7 ++
 virt/kvm/kvmi_msg.c|  42 +++
 5 files changed, 227 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 69557c63ff94..eef32107837a 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -760,6 +760,66 @@ corresponding bit set to 1.
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOMEM - not enough memory to add the page tracking structures
 
+14. KVMI_READ_PHYSICAL
+--
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_read_physical {
+   __u64 gpa;
+   __u64 size;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   __u8 data[0];
+
+Reads from the guest memory.
+
+Currently, the size must be non-zero and the read must be restricted to
+one page (offset + size <= PAGE_SIZE).
+
+:Errors:
+
+* -KVM_EINVAL - the specified gpa is invalid
+
+15. KVMI_WRITE_PHYSICAL
+---
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_write_physical {
+   __u64 gpa;
+   __u64 size;
+   __u8  data[0];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Writes into the guest memory.
+
+Currently, the size must be non-zero and the write must be restricted to
+one page (offset + size <= PAGE_SIZE).
+
+:Errors:
+
+* -KVM_EINVAL - the specified gpa is invalid
+
 Events
 ==
 
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 0b3139c52a30..be3f066f314e 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -191,6 +191,17 @@ struct kvmi_control_vm_events {
__u32 padding2;
 };
 
+struct kvmi_read_physical {
+   __u64 gpa;
+   __u64 size;
+};
+
+struct kvmi_write_physical {
+   __u64 gpa;
+   __u64 size;
+   __u8  data[0];
+};
+
 struct kvmi_vcpu_hdr {
__u16 vcpu;
__u16 padding1;
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index d2bebef98d8d..a84eb150e116 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2017-2019 Bitdefender S.R.L.
  *
  */
+#include 
 #include 
 #include "kvmi_int.h"
 #include 
@@ -1220,6 +1221,112 @@ int kvmi_cmd_set_page_write_bitmap(struct kvmi *ikvm, u64 gpa,
return kvmi_set_gfn_access(ikvm->kvm, gfn, access, write_bitmap);
 }
 
+unsigned long gfn_to_hva_safe(struct kvm *kvm, gfn_t gfn)
+{
+   unsigned long hva;
+   int srcu_idx;
+
+   srcu_idx = srcu_read_lock(&kvm->srcu);
+   hva = gfn_to_hva(kvm, gfn);
+   srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+   return hva;
+}
+
+static long get_user_pages_remote_unlocked(struct mm_struct *mm,
+   unsigned long start,
+   unsigned long nr_pages,
+   unsigned int gup_flags,
+   struct page **pages)
+{
+   long ret;
+   struct task_struct *tsk = NULL;
+   struct vm_area_struct **vmas = NULL;
+   int locked = 1;
+
+   down_read(&mm->mmap_sem);
+   ret = get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+   pages, vmas, &locked);
+   if (locked)
+   up_read(&mm->mmap_sem);
+   return ret;
+}
+
+static void *get_page_ptr(struct kvm *kvm, gpa_t gpa, struct page **page,
+ bool write)
+{
+   unsigned int flags = write ? FOLL_WRITE : 0;
+   unsigned long hva;
+
+   *page = NULL;
+
+   hva = gfn_to_hva_safe(kvm, gpa_to_gfn(gpa));
+
+   if (kvm_is_error_hva(hva)) {
+   kvmi_err(IKVM(kvm), "Invalid gpa %llx\n", gpa);
+   return NULL;
+   }
+
+   if (get_user_pages_remote_unlocked(kvm->mm, hva, 1, flags, page) != 1) {
+   kvmi_err(IKVM(kvm),
+"Failed to get the page for hva %lx gpa %llx\n",
+hva, gpa);
+   return NULL;
+   }
+
+   return kmap_atomic(*page);
+}
+
+static void put_page_ptr(void *ptr, struct page *page)
+{
+   if (ptr)
+   kunmap_atomic(ptr);
+   if (page)
+   put_page(page);
+}
+
+int kvmi_cmd_read_physical(struct kvm *kvm, u64 gpa, u64 size, int(*send)(
+   struct kvmi *, const struct kvmi_msg_hdr *,
+   int err, const void *buf, size_t),
+   const struct kvmi_msg_hdr *ctx)
+{
+   int err, ec = 0;
+   struct page *page = NULL;
+   void *ptr_page = NULL, *ptr = NULL;
+   size_t ptr_size = 0;
+
+   ptr_page = get_page_ptr(kvm, gpa, &page, false);
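
To make the one-page restriction concrete, a small tool-side sketch
(hypothetical helper, not from the patch) computing the largest size a
KVMI_READ_PHYSICAL or KVMI_WRITE_PHYSICAL request may use for a given gpa:

#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* Hypothetical helper: the largest size allowed for an access starting
 * at gpa, so that offset + size <= PAGE_SIZE always holds.
 */
static uint64_t kvmi_max_access_size(uint64_t gpa)
{
	return PAGE_SIZE - (gpa & (PAGE_SIZE - 1));
}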

[RFC PATCH v6 59/92] kvm: introspection: add KVMI_EVENT_XSETBV

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This event is sent when the extended control register XCR0 is going to
be changed.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 25 +++
 arch/x86/include/asm/kvmi_host.h   |  5 
 arch/x86/kvm/kvmi.c| 39 ++
 arch/x86/kvm/x86.c |  5 
 4 files changed, 74 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index e58f0e22f188..1d2431639770 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1444,3 +1444,28 @@ register (see **KVMI_CONTROL_EVENTS**).
 
 ``kvmi_event``, the MSR number, the old value and the new value are
 sent to the introspector. The *CONTINUE* action will set the ``new_val``.
+
+8. KVMI_EVENT_XSETBV
+
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+
+This event is sent when the extended control register XCR0 is going
+to be changed and the introspection has been enabled for this event
+(see *KVMI_CONTROL_EVENTS*).
+
+``kvmi_event`` is sent to the introspector.
diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
index 86d90b7bed84..3f066e7feee2 100644
--- a/arch/x86/include/asm/kvmi_host.h
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -15,6 +15,7 @@ bool kvmi_msr_event(struct kvm_vcpu *vcpu, struct msr_data *msr);
 bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr);
 bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
   unsigned long old_value, unsigned long *new_value);
+void kvmi_xsetbv_event(struct kvm_vcpu *vcpu);
 
 #else /* CONFIG_KVM_INTROSPECTION */
 
@@ -35,6 +36,10 @@ static inline bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
return true;
 }
 
+static inline void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
+{
+}
+
 #endif /* CONFIG_KVM_INTROSPECTION */
 
 #endif /* _ASM_X86_KVMI_HOST_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 0114ed66f4f3..0e9c91d2f282 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -389,6 +389,45 @@ bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
return ret;
 }
 
+static u32 kvmi_send_xsetbv(struct kvm_vcpu *vcpu)
+{
+   int err, action;
+
+   err = kvmi_send_event(vcpu, KVMI_EVENT_XSETBV, NULL, 0,
+ NULL, 0, &action);
+   if (err)
+   return KVMI_EVENT_ACTION_CONTINUE;
+
+   return action;
+}
+
+static void __kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
+{
+   u32 action;
+
+   action = kvmi_send_xsetbv(vcpu);
+   switch (action) {
+   case KVMI_EVENT_ACTION_CONTINUE:
+   break;
+   default:
+   kvmi_handle_common_event_actions(vcpu, action, "XSETBV");
+   }
+}
+
+void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
+{
+   struct kvmi *ikvm;
+
+   ikvm = kvmi_get(vcpu->kvm);
+   if (!ikvm)
+   return;
+
+   if (is_event_enabled(vcpu, KVMI_EVENT_XSETBV))
+   __kvmi_xsetbv_event(vcpu);
+
+   kvmi_put(vcpu->kvm);
+}
+
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
u8 access)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05ff23180355..278a286ba262 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -868,6 +868,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
+#ifdef CONFIG_KVM_INTROSPECTION
+   if (xcr != vcpu->arch.xcr0)
+   kvmi_xsetbv_event(vcpu);
+#endif /* CONFIG_KVM_INTROSPECTION */
+
if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);

[RFC PATCH v6 73/92] kvm: introspection: use remote mapping

2019-08-09 Thread Adalbert Lazăr
From: Mircea Cîrjaliu 

This commit adds the missing KVMI_GET_MAP_TOKEN command and handles the
hypercalls used to map/unmap guest pages.

Suggested-by: Paolo Bonzini 
Signed-off-by: Mircea Cîrjaliu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  39 
 arch/x86/kvm/Makefile  |   2 +-
 arch/x86/kvm/x86.c |   6 +
 include/linux/kvmi.h   |   3 +
 virt/kvm/kvmi.c|  12 +-
 virt/kvm/kvmi_int.h|  10 +
 virt/kvm/kvmi_mem.c| 319 +
 virt/kvm/kvmi_msg.c|  15 ++
 8 files changed, 404 insertions(+), 2 deletions(-)
 create mode 100644 virt/kvm/kvmi_mem.c

diff --git a/Documentation/virtual/kvm/kvmi.rst b/Documentation/virtual/kvm/kvmi.rst
index 572abab1f6ef..b12e14f14c21 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1144,6 +1144,45 @@ Returns the guest memory type for a specific physical address.
 * -KVM_EINVAL - padding is not zero
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 
+25. KVMI_GET_MAP_TOKEN
+--
+
+:Architecture: all
+:Versions: >= 1
+:Parameters: none
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_map_token_reply {
+   struct kvmi_map_mem_token token;
+   };
+
+Where::
+
+   struct kvmi_map_mem_token {
+   __u64 token[4];
+   };
+
+Requests a token for a memory map operation.
+
+On this command, the host generates a random token to be used (once)
+to map a physical page from the introspected guest. The introspector
+could use the token with the KVM_INTRO_MEM_MAP ioctl (on /dev/kvmmem)
+to map a guest physical page to one of its memory pages. The ioctl,
+in turn, will use the KVM_HC_MEM_MAP hypercall (see hypercalls.txt).
+
+The guest kernel exposing /dev/kvmmem keeps a list with all the mappings
+(to all the guests introspected by the tool) in order to unmap them
+(using the KVM_HC_MEM_UNMAP hypercall) when /dev/kvmmem is closed or on
+demand (using the KVM_INTRO_MEM_UNMAP ioctl).
+
+:Errors:
+
+* -KVM_EAGAIN - too many tokens have accumulated
+* -KVM_ENOMEM - not enough memory to allocate a new token
+
 Events
 ==
 
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 673cf37c0747..5bea446219ca 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ KVM := ../../../virt/kvm
 kvm-y  += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-$(CONFIG_KVM_INTROSPECTION) += $(KVM)/kvmi.o $(KVM)/kvmi_msg.o kvmi.o
+kvm-$(CONFIG_KVM_INTROSPECTION) += $(KVM)/kvmi.o $(KVM)/kvmi_msg.o $(KVM)/kvmi_mem.o kvmi.o
 
 kvm-y  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 06f44ce8ed07..04b1d2916a0a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7337,6 +7337,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
 #ifdef CONFIG_KVM_INTROSPECTION
+   case KVM_HC_MEM_MAP:
+   ret = kvmi_host_mem_map(vcpu, (gva_t)a0, (gpa_t)a1, (gpa_t)a2);
+   break;
+   case KVM_HC_MEM_UNMAP:
+   ret = kvmi_host_mem_unmap(vcpu, (gpa_t)a0);
+   break;
case KVM_HC_XEN_HVM_OP:
ret = 0;
if (!kvmi_hypercall_event(vcpu))
diff --git a/include/linux/kvmi.h b/include/linux/kvmi.h
index 10cd6c6412d2..dd980fb0ebcd 100644
--- a/include/linux/kvmi.h
+++ b/include/linux/kvmi.h
@@ -24,6 +24,9 @@ bool kvmi_descriptor_event(struct kvm_vcpu *vcpu, u8 
descriptor, u8 write);
 bool kvmi_tracked_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 bool kvmi_single_step(struct kvm_vcpu *vcpu, gpa_t gpa, int *emulation_type);
 void kvmi_handle_requests(struct kvm_vcpu *vcpu);
+int kvmi_host_mem_map(struct kvm_vcpu *vcpu, gva_t tkn_gva,
+gpa_t req_gpa, gpa_t map_gpa);
+int kvmi_host_mem_unmap(struct kvm_vcpu *vcpu, gpa_t map_gpa);
 void kvmi_stop_ss(struct kvm_vcpu *vcpu);
 bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu);
 void kvmi_init_emulate(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index ca146ffec061..157f3a401d64 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -10,6 +10,7 @@
 #include "kvmi_int.h"
 #include 
 #include 
+#include 
 
 #define MAX_PAUSE_REQUESTS 1001
 
@@ -320,11 +321,13 @@ static int kvmi_cache_create(void)
 
 int kvmi_init(void)
 {
+   kvmi_mem_init();
return kvmi_cache_create();
 }
 
 void kvmi_uninit(void)
 {
+   kvmi_mem_exit();
kvmi_cache_destroy();
 }
 
@@ -1647,6 +1650,11 @@ int kvmi_cmd_write_physical
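
For reference, a minimal introspector-side sketch of the token flow
documented above (it runs inside the guest hosting the introspection
tool). The kvmi_mem_map request layout, the kvmi_get_map_token()
transport helper and the ioctl_nr parameter are assumptions made for
this sketch; only struct kvmi_map_mem_token, /dev/kvmmem and the
KVM_INTRO_MEM_MAP/KVM_HC_MEM_MAP names come from the patch:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct kvmi_map_mem_token {
	uint64_t token[4];
};

struct kvmi_mem_map {			/* assumed request layout */
	struct kvmi_map_mem_token token;
	uint64_t req_gpa;		/* page of the introspected guest */
	uint64_t map_gpa;		/* local page receiving the mapping */
};

/* assumed helper: sends KVMI_GET_MAP_TOKEN over the introspection socket */
int kvmi_get_map_token(int sock_fd, struct kvmi_map_mem_token *token);

int map_remote_page(int sock_fd, unsigned long ioctl_nr,
		    uint64_t req_gpa, uint64_t map_gpa)
{
	struct kvmi_mem_map req;
	int fd, err;

	/* each token is valid for a single mapping */
	if (kvmi_get_map_token(sock_fd, &req.token) < 0)
		return -1;

	fd = open("/dev/kvmmem", O_RDWR);
	if (fd < 0)
		return -1;

	req.req_gpa = req_gpa;
	req.map_gpa = map_gpa;
	/* ioctl_nr = KVM_INTRO_MEM_MAP; the driver issues KVM_HC_MEM_MAP */
	err = ioctl(fd, ioctl_nr, &req);

	close(fd);
	return err;
}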

[RFC PATCH v6 80/92] kvm: x86: emulate movss xmm, m32

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed in order to be able to support guest code that uses movss to
write into pages that are marked for write tracking.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9d38f892beea..b8a412b8b087 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1184,9 +1184,13 @@ static u8 simd_prefix_to_bytes(const struct 
x86_emulate_ctxt *ctxt,
 
switch (ctxt->b) {
case 0x11:
+   /* movss xmm, m32 */
/* movsd xmm, m64 */
/* movups xmm, m128 */
-   if (simd_prefix == 0xf2) {
+   if (simd_prefix == 0xf3) {
+   bytes = 4;
+   break;
+   } else if (simd_prefix == 0xf2) {
bytes = 8;
break;
}
@@ -4550,7 +4554,7 @@ static const struct gprefix pfx_0f_2b = {
 };
 
 static const struct gprefix pfx_0f_10_0f_11 = {
-   I(Unaligned, em_mov), I(Unaligned, em_mov), I(Unaligned, em_mov), N,
+   I(Unaligned, em_mov), I(Unaligned, em_mov), I(Unaligned, em_mov), I(Unaligned, em_mov),
 };
 
 static const struct gprefix pfx_0f_28_0f_29 = {

[RFC PATCH v6 68/92] kvm: x86: emulate a guest page table walk on SPT violations due to A/D bit updates

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

On SPT page faults caused by guest page table walks, use the existing
guest page table walk code to make the necessary adjustments to the A/D
bits and return to guest. This effectively bypasses the x86 emulator,
which was making the wrong modifications, leading one OS (Windows 8.1
x64) to triple-fault very early in the boot process with introspection
enabled.

With introspection disabled, these faults are handled by simply removing
the protection from the affected guest page and returning to guest.

CC: Sean Christopherson 
Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h  |  2 +-
 arch/x86/include/asm/kvmi_host.h |  6 ++
 arch/x86/kvm/kvmi.c  | 34 +++-
 arch/x86/kvm/mmu.c   | 11 +--
 arch/x86/kvm/x86.c   |  6 +++---
 include/linux/kvmi.h |  3 +++
 virt/kvm/kvmi.c  | 31 +++--
 7 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2392678dde46..79f3aa6928e5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1425,7 +1425,7 @@ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, 
gva_t gva,
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
   struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
-   struct x86_exception *exception);
+   u32 access, struct x86_exception *exception);
 
 void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
index 3f066e7feee2..73369874f3a8 100644
--- a/arch/x86/include/asm/kvmi_host.h
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -16,6 +16,7 @@ bool kvmi_monitored_msr(struct kvm_vcpu *vcpu, u32 msr);
 bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
   unsigned long old_value, unsigned long *new_value);
 void kvmi_xsetbv_event(struct kvm_vcpu *vcpu);
+bool kvmi_update_ad_flags(struct kvm_vcpu *vcpu);
 
 #else /* CONFIG_KVM_INTROSPECTION */
 
@@ -40,6 +41,11 @@ static inline void kvmi_xsetbv_event(struct kvm_vcpu *vcpu)
 {
 }
 
+static inline bool kvmi_update_ad_flags(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
+
 #endif /* CONFIG_KVM_INTROSPECTION */
 
 #endif /* _ASM_X86_KVMI_HOST_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 9d66c7d6c953..5312f179af9c 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -465,7 +465,7 @@ void kvmi_arch_breakpoint_event(struct kvm_vcpu *vcpu, u64 
gva, u8 insn_len)
u32 action;
u64 gpa;
 
-   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, 0, NULL);
 
action = kvmi_msg_send_bp(vcpu, gpa, insn_len);
switch (action) {
@@ -822,6 +822,38 @@ u8 kvmi_arch_relax_page_access(u8 old, u8 new)
return ret;
 }
 
+bool kvmi_update_ad_flags(struct kvm_vcpu *vcpu)
+{
+   struct x86_exception exception = { };
+   struct kvmi *ikvm;
+   bool ret = false;
+   gva_t gva;
+   gpa_t gpa;
+
+   ikvm = kvmi_get(vcpu->kvm);
+   if (!ikvm)
+   return false;
+
+   gva = kvm_mmu_fault_gla(vcpu);
+
+   if (gva == ~0ull) {
+   kvmi_warn_once(ikvm, "%s: cannot perform translation\n",
+  __func__);
+   goto out;
+   }
+
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, PFERR_WRITE_MASK, NULL);
+   if (gpa == UNMAPPED_GVA)
+   gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, 0, &exception);
+
+   ret = (gpa != UNMAPPED_GVA);
+
+out:
+   kvmi_put(vcpu->kvm);
+
+   return ret;
+}
+
 static const struct {
unsigned int allow_bit;
enum kvm_page_track_mode track_mode;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c2f863797495..65b6acba82da 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -40,7 +40,9 @@
 #include 
 #include 
 #include 
+#include 
 
+#include 
 #include 
 #include 
 #include 
@@ -5960,8 +5962,13 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, 
u64 error_code,
 */
if (vcpu->arch.mmu->direct_map &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
-   return 1;
+   if (kvmi_tracked_gfn(vcpu, gpa_to_gfn(cr2))) {
+   if (kvmi_update_ad_flags(vcpu))
+   return 1;
+   } else {
+   kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+   return 1;
+   }
}
 
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd10

[RFC PATCH v6 78/92] kvm: x86: add tracepoints for interrupt and exception injections

2019-08-09 Thread Adalbert Lazăr
From: Nicușor Cîțu 

This patch introduces additional tracepoints that are meant to help
in following the flow of interrupts and exceptions queued to a guest
VM. At the same time the kvm_exit tracepoint is enhanced with the
vCPU ID.

One scenario in which these help is debugging lost interrupts due to
a buggy VMEXIT handler.

Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/svm.c |   9 +++-
 arch/x86/kvm/trace.h   | 118 -
 arch/x86/kvm/vmx/vmx.c |   8 ++-
 arch/x86/kvm/x86.c |  12 +++--
 4 files changed, 116 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cb536a2611f6..00bdf885f9a4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -799,6 +799,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
bool reinject = vcpu->arch.exception.injected;
u32 error_code = vcpu->arch.exception.error_code;
 
+   trace_kvm_inj_exception(vcpu);
+
/*
 * If we are within a nested VM we'd better #VMEXIT and let the guest
 * handle the exception
@@ -5108,6 +5110,8 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
+   trace_kvm_inj_nmi(vcpu);
+
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
set_intercept(svm, INTERCEPT_IRET);
@@ -5133,7 +5137,8 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
 
BUG_ON(!(gif_set(svm)));
 
-   trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+   trace_kvm_inj_interrupt(vcpu);
+
++vcpu->stat.irq_injections;
 
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
@@ -5637,6 +5642,8 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
 
+   trace_kvm_cancel_inj(vcpu);
+
control->exit_int_info = control->event_inj;
control->exit_int_info_err = control->event_inj_err;
control->event_inj = 0;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6432d08c7de7..cb47889ddc2c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -227,6 +227,7 @@ TRACE_EVENT(kvm_exit,
TP_ARGS(exit_reason, vcpu, isa),
 
TP_STRUCT__entry(
+   __field(unsigned int,   vcpu_id )
__field(unsigned int,   exit_reason )
__field(unsigned long,  guest_rip   )
__field(u32,isa )
@@ -235,6 +236,7 @@ TRACE_EVENT(kvm_exit,
),
 
TP_fast_assign(
+   __entry->vcpu_id= vcpu->vcpu_id;
__entry->exit_reason= exit_reason;
__entry->guest_rip  = kvm_rip_read(vcpu);
__entry->isa= isa;
@@ -242,7 +244,8 @@ TRACE_EVENT(kvm_exit,
   &__entry->info2);
),
 
-   TP_printk("reason %s rip 0x%lx info %llx %llx",
+   TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx",
+__entry->vcpu_id,
 (__entry->isa == KVM_ISA_VMX) ?
 __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
 __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
@@ -252,19 +255,38 @@ TRACE_EVENT(kvm_exit,
 /*
  * Tracepoint for kvm interrupt injection:
  */
-TRACE_EVENT(kvm_inj_virq,
-   TP_PROTO(unsigned int irq),
-   TP_ARGS(irq),
-
+TRACE_EVENT(kvm_inj_interrupt,
+   TP_PROTO(struct kvm_vcpu *vcpu),
+   TP_ARGS(vcpu),
TP_STRUCT__entry(
-   __field(unsigned int,   irq )
+   __field(__u32, vcpu_id)
+   __field(__u32, nr)
),
-
TP_fast_assign(
-   __entry->irq= irq;
+   __entry->vcpu_id = vcpu->vcpu_id;
+   __entry->nr = vcpu->arch.interrupt.nr;
),
+   TP_printk("vcpu %u irq %u",
+ __entry->vcpu_id,
+ __entry->nr
+   )
+);
 
-   TP_printk("irq %u", __entry->irq)
+/*
+ * Tracepoint for kvm nmi injection:
+ */
+TRACE_EVENT(kvm_inj_nmi,
+   TP_PROTO(struct kvm_vcpu *vcpu),
+   TP_ARGS(vcpu),
+   TP_STRUCT__entry(
+   __field(__u32, vcpu_id)
+   ),
+   TP_fast_assign(
+   __entry->vcpu_id = vcpu->vcpu_id;
+   ),
+   TP_printk("vcpu %u",
+ __entry->vcpu_id
+   )
 );
 
 #define EXS(x) { x##_VECTOR, "#" #x }
@@ -275,28 +297,76 @@ TRACE_EVENT(kvm_inj_virq,
EXS(MF), EXS(AC), EXS(MC)
 
 /*
- * Tracepoint for kvm interrupt injection:
+ * Tracepoint for kvm exception injection:

[RFC PATCH v6 51/92] kvm: introspection: add KVMI_SET_REGISTERS

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This command is allowed only during a vCPU event (an event has been sent
and the vCPU is waiting for the reply). The registers will be set only
when the reply has been received.

Suggested-by: Paolo Bonzini 
Signed-off-by: Mihai Donțu 
Co-developed-by: Mircea Cîrjaliu 
Signed-off-by: Mircea Cîrjaliu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 28 +
 arch/x86/kvm/x86.c | 33 ++
 include/linux/kvm_host.h   |  1 +
 virt/kvm/kvmi.c| 25 ++
 virt/kvm/kvmi_int.h|  5 +
 virt/kvm/kvmi_msg.c| 16 +++
 6 files changed, 108 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index edf81e03ca3c..b6722d071ab7 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -905,6 +905,34 @@ registers, the special registers and the requested set of 
MSRs.
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 * -KVM_ENOMEM - not enough memory to allocate the reply
 
+18. KVMI_SET_REGISTERS
+--
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvm_regs;
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Sets the general purpose registers for the given vCPU. The changes become
+visible to other threads accessing the KVM vCPU structure after the event
+currently being handled is replied to.
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+
 Events
 ==
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ef29ef7617bf..62d15bbb2332 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8431,6 +8431,39 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, 
struct kvm_regs *regs)
return 0;
 }
 
+/*
+ * Similar to __set_regs() but it does not reset the exceptions
+ */
+void kvm_arch_vcpu_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+   vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+   vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+   kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+   kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+   kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+   kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+   kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+   kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+   kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+   kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+#ifdef CONFIG_X86_64
+   kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+   kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+   kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+   kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+   kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+   kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+   kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+   kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+#endif
+
+   kvm_rip_write(vcpu, regs->rip);
+   kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+
+   kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
struct kvm_segment cs;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 09bc06747642..c8eb1a4d997f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -791,6 +791,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
 void kvm_arch_vcpu_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
+void kvm_arch_vcpu_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
  struct kvm_sregs *sregs);
 void kvm_arch_vcpu_get_sregs(struct kvm_vcpu *vcpu,
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 85de2da3eb7b..a20891d3a2ce 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -1212,6 +1212,31 @@ void kvmi_handle_requests(struct kvm_vcpu *vcpu)
kvmi_put(vcpu->kvm);
 }
 
+void kvmi_post_reply(struct kvm_vcpu *vcpu)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   if (ivcpu->have_delayed_regs) {
+   kvm_arch_vcpu_set_regs(vcpu, &ivcpu->delayed_regs);
+   ivcpu->have_delayed_regs = false;
+   }
+}
+
+int kvmi_cmd_set_registers(struct kvm_vcpu *vcpu, const struct kvm_regs *regs)
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   if (ivcpu->
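
To illustrate the delayed-apply rule from the commit message, a minimal
introspector-side sketch; the kvmi_* wrappers are assumed transport
helpers, and only the struct kvm_regs usage and the action value follow
the series:

#include <stdint.h>
#include <asm/kvm.h>	/* struct kvm_regs */

/* assumed transport helpers */
int kvmi_send_set_registers(int fd, uint16_t vcpu, const struct kvm_regs *regs);
int kvmi_reply_event(int fd, uint32_t seq, uint32_t action);

#define KVMI_EVENT_ACTION_CONTINUE 0	/* from the series */

/* skip the instruction that triggered the pending vCPU event */
int skip_instruction(int fd, uint16_t vcpu, uint32_t event_seq,
		     struct kvm_regs *regs, unsigned int insn_len)
{
	regs->rip += insn_len;

	/* only queued on the vCPU; nothing is changed yet */
	if (kvmi_send_set_registers(fd, vcpu, regs) < 0)
		return -1;

	/* the new registers are applied when this reply is processed */
	return kvmi_reply_event(fd, event_seq, KVMI_EVENT_ACTION_CONTINUE);
}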

[RFC PATCH v6 39/92] KVM: VMX: Introduce SPP user-space IOCTLs

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

A user application, e.g. QEMU or a VMI tool, must initialize SPP
before getting/setting SPP subpages; the dynamic initialization
reduces the extra storage cost if the SPP feature is not used.

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-7-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c   | 73 
 include/linux/kvm_host.h |  3 ++
 include/uapi/linux/kvm.h |  3 ++
 3 files changed, 79 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b8ae25cb227b..ef29ef7617bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4926,6 +4926,53 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
goto out;
r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+   }
+   case KVM_SUBPAGES_GET_ACCESS: {
+   struct kvm_subpage spp_info;
+
+   if (!kvm->arch.spp_active) {
+   r = -ENODEV;
+   goto out;
+   }
+
+   r = -EFAULT;
+   if (copy_from_user(&spp_info, argp, sizeof(spp_info)))
+   goto out;
+
+   r = -EINVAL;
+   if (spp_info.npages == 0 ||
+   spp_info.npages > SUBPAGE_MAX_BITMAP)
+   goto out;
+
+   r = kvm_vm_ioctl_get_subpages(kvm, &spp_info);
+   if (copy_to_user(argp, &spp_info, sizeof(spp_info))) {
+   r = -EFAULT;
+   goto out;
+   }
+   break;
+   }
+   case KVM_SUBPAGES_SET_ACCESS: {
+   struct kvm_subpage spp_info;
+
+   if (!kvm->arch.spp_active) {
+   r = -ENODEV;
+   goto out;
+   }
+
+   r = -EFAULT;
+   if (copy_from_user(&spp_info, argp, sizeof(spp_info)))
+   goto out;
+
+   r = -EINVAL;
+   if (spp_info.npages == 0 ||
+   spp_info.npages > SUBPAGE_MAX_BITMAP)
+   goto out;
+
+   r = kvm_vm_ioctl_set_subpages(kvm, &spp_info);
+   break;
+   }
+   case KVM_INIT_SPP: {
+   r = kvm_vm_ioctl_init_spp(kvm);
break;
}
default:
@@ -9906,6 +9953,32 @@ bool kvm_arch_has_irq_bypass(void)
return kvm_x86_ops->update_pi_irte != NULL;
 }
 
+int kvm_arch_get_subpages(struct kvm *kvm,
+ struct kvm_subpage *spp_info)
+{
+   if (!kvm_x86_ops->get_subpages)
+   return -EINVAL;
+
+   return kvm_x86_ops->get_subpages(kvm, spp_info);
+}
+
+int kvm_arch_set_subpages(struct kvm *kvm,
+ struct kvm_subpage *spp_info)
+{
+   if (!kvm_x86_ops->set_subpages)
+   return -EINVAL;
+
+   return kvm_x86_ops->set_subpages(kvm, spp_info);
+}
+
+int kvm_arch_init_spp(struct kvm *kvm)
+{
+   if (!kvm_x86_ops->init_spp)
+   return -EINVAL;
+
+   return kvm_x86_ops->init_spp(kvm);
+}
+
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
  struct irq_bypass_producer *prod)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0b9a0f546397..ae4106aae16e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -837,6 +837,9 @@ struct kvm_mmu_page *kvm_mmu_get_spp_page(struct kvm_vcpu 
*vcpu,
 int kvm_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
 int kvm_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
 int kvm_init_spp(struct kvm *kvm);
+int kvm_arch_get_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
+int kvm_arch_set_subpages(struct kvm *kvm, struct kvm_subpage *spp_info);
+int kvm_arch_init_spp(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index ad8f2a3ca72d..86dd57e67539 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1248,6 +1248,9 @@ struct kvm_vfio_spapr_tce {
struct kvm_userspace_memory_region)
 #define KVM_SET_TSS_ADDR  _IO(KVMIO,   0x47)
 #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO,  0x48, __u64)
+#define KVM_SUBPAGES_GET_ACCESS   _IOR(KVMIO,  0x49, __u64)
+#define KVM_SUBPAGES_SET_ACCESS   _IOW(KVMIO,  0x4a, __u64)
+#define KVM_INIT_SPP  _IOW(KVMIO,  0x4b, __u64)
 
 /* enable ucontrol for s390 */
 struct kvm_s390_ucas_mapping {
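A minimal userspace sketch of the ioctl flow added above. The struct
kvm_subpage field names (base_gfn, npages, access_map) and the
SUBPAGE_MAX_BITMAP value are assumptions, since the structure is
defined elsewhere in the SPP series; the ioctl names are the ones
introduced by this patch:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>		/* KVM_INIT_SPP, KVM_SUBPAGES_SET_ACCESS */

#define SUBPAGE_MAX_BITMAP 64			/* assumed */

struct kvm_subpage {				/* assumed layout */
	uint64_t base_gfn;
	uint64_t npages;
	uint32_t access_map[SUBPAGE_MAX_BITMAP];/* one 32-bit vector per page */
};

int protect_page(int vm_fd, uint64_t gfn)
{
	struct kvm_subpage spp = { 0 };

	/* one-time setup: allocates the SPPT root and permission vectors */
	if (ioctl(vm_fd, KVM_INIT_SPP, 0) < 0)
		return -1;

	spp.base_gfn = gfn;
	spp.npages = 1;
	/* bit i set => the i-th 128-byte sub-page stays writable */
	spp.access_map[0] = 0x3; /* allow writes only in the first 256 bytes */

	return ioctl(vm_fd, KVM_SUBPAGES_SET_ACCESS, &spp);
}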

[RFC PATCH v6 50/92] kvm: introspection: add KVMI_GET_REGISTERS

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This command is used to get kvm_regs and kvm_sregs structures,
plus the list of struct kvm_msrs.

Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 43 
 arch/x86/include/uapi/asm/kvmi.h   | 15 ++
 arch/x86/kvm/kvmi.c| 78 ++
 virt/kvm/kvmi_int.h|  5 ++
 virt/kvm/kvmi_msg.c| 17 +++
 5 files changed, 158 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 558d3eb6007f..edf81e03ca3c 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -862,6 +862,49 @@ The introspection tool should use *KVMI_CONTROL_VM_EVENTS* 
to enable the
 * -KVM_EBUSY  - the selected vCPU has too many queued *KVMI_EVENT_PAUSE_VCPU* 
events
 * -KVM_EPERM  - the *KVMI_EVENT_PAUSE_VCPU* event is disallowed (see 
*KVMI_CONTROL_EVENTS*)
and the introspection tool expects a reply.
+
+17. KVMI_GET_REGISTERS
+--
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_get_registers {
+   __u16 nmsrs;
+   __u16 padding1;
+   __u32 padding2;
+   __u32 msrs_idx[0];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_registers_reply {
+   __u32 mode;
+   __u32 padding;
+   struct kvm_regs regs;
+   struct kvm_sregs sregs;
+   struct kvm_msrs msrs;
+   };
+
+For the given vCPU and the ``nmsrs``-sized array of MSR indexes,
+returns the current vCPU mode (in bytes: 2, 4 or 8), the general purpose
+registers, the special registers and the requested set of MSRs.
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - one of the indicated MSR-s is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOMEM - not enough memory to allocate the reply
+
 Events
 ==
 
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index 551f9ed1ed9c..98fb27e1273c 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -26,4 +26,19 @@ struct kvmi_event_arch {
} msrs;
 };
 
+struct kvmi_get_registers {
+   __u16 nmsrs;
+   __u16 padding1;
+   __u32 padding2;
+   __u32 msrs_idx[0];
+};
+
+struct kvmi_get_registers_reply {
+   __u32 mode;
+   __u32 padding;
+   struct kvm_regs regs;
+   struct kvm_sregs sregs;
+   struct kvm_msrs msrs;
+};
+
 #endif /* _UAPI_ASM_X86_KVMI_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index fa290fbf1f75..a78771b21d2f 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -7,6 +7,25 @@
 #include "x86.h"
 #include "../../../virt/kvm/kvmi_int.h"
 
+static void *alloc_get_registers_reply(const struct kvmi_msg_hdr *msg,
+  const struct kvmi_get_registers *req,
+  size_t *rpl_size)
+{
+   struct kvmi_get_registers_reply *rpl;
+   u16 k, n = req->nmsrs;
+
+   *rpl_size = sizeof(*rpl) + sizeof(rpl->msrs.entries[0]) * n;
+   rpl = kvmi_msg_alloc_check(*rpl_size);
+   if (rpl) {
+   rpl->msrs.nmsrs = n;
+
+   for (k = 0; k < n; k++)
+   rpl->msrs.entries[k].index = req->msrs_idx[k];
+   }
+
+   return rpl;
+}
+
 /*
  * TODO: this can be done from userspace.
  *   - all these registers are sent with struct kvmi_event_arch
@@ -38,6 +57,65 @@ static unsigned int kvmi_vcpu_mode(const struct kvm_vcpu 
*vcpu,
return mode;
 }
 
+static int kvmi_get_registers(struct kvm_vcpu *vcpu, u32 *mode,
+ struct kvm_regs *regs,
+ struct kvm_sregs *sregs,
+ struct kvm_msrs *msrs)
+{
+   struct kvm_msr_entry *msr = msrs->entries;
+   struct kvm_msr_entry *end = msrs->entries + msrs->nmsrs;
+
+   kvm_arch_vcpu_get_regs(vcpu, regs);
+   kvm_arch_vcpu_get_sregs(vcpu, sregs);
+   *mode = kvmi_vcpu_mode(vcpu, sregs);
+
+   for (; msr < end; msr++) {
+   struct msr_data m = {
+   .index = msr->index,
+   .host_initiated = true
+   };
+   int err = kvm_get_msr(vcpu, &m);
+
+   if (err)
+   return -KVM_EINVAL;
+
+   msr->data = m.data;
+   }
+
+   return 0;
+}
+
+int kvmi_arch_cmd_get_registers(struct kvm_vcpu *vcpu,
+   const struct kvmi_msg_hdr *msg,
+   const struct kvmi_get_registers *req,
+   struct kvmi_get_registers_repl
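
A sketch of how an introspection tool might build the variable-sized
request defined above, for two MSRs; kvmi_send_msg() and the msg_id
argument are placeholders for the tool's transport layer, while the
trailing msrs_idx[] array and the zeroed padding follow the patch:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct kvmi_get_registers {		/* as defined by the patch */
	uint16_t nmsrs;
	uint16_t padding1;
	uint32_t padding2;
	uint32_t msrs_idx[0];
};

/* assumed transport helper */
int kvmi_send_msg(int fd, int msg_id, uint16_t vcpu, const void *req,
		  size_t size);

int get_registers(int fd, int msg_id, uint16_t vcpu)
{
	uint32_t idx[] = { 0xc0000080 /* MSR_EFER */,
			   0x174 /* MSR_IA32_SYSENTER_CS */ };
	size_t size = sizeof(struct kvmi_get_registers) + sizeof(idx);
	struct kvmi_get_registers *req = calloc(1, size); /* padding stays 0 */
	int err;

	if (!req)
		return -1;

	req->nmsrs = 2;
	memcpy(req->msrs_idx, idx, sizeof(idx));

	err = kvmi_send_msg(fd, msg_id, vcpu, req, size);
	free(req);
	return err;
}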

[RFC PATCH v6 79/92] kvm: x86: emulate movsd xmm, m64

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed in order to be able to support guest code that uses movsd to
write into pages that are marked for write tracking.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 32 +++-
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 34431cf31f74..9d38f892beea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1177,6 +1177,27 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
 }
 
+static u8 simd_prefix_to_bytes(const struct x86_emulate_ctxt *ctxt,
+  int simd_prefix)
+{
+   u8 bytes;
+
+   switch (ctxt->b) {
+   case 0x11:
+   /* movsd xmm, m64 */
+   /* movups xmm, m128 */
+   if (simd_prefix == 0xf2) {
+   bytes = 8;
+   break;
+   }
+   /* fallthrough */
+   default:
+   bytes = 16;
+   break;
+   }
+   return bytes;
+}
+
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
 {
@@ -1187,7 +1208,7 @@ static void decode_register_operand(struct 
x86_emulate_ctxt *ctxt,
 
if (ctxt->d & Sse) {
op->type = OP_XMM;
-   op->bytes = 16;
+   op->bytes = ctxt->op_bytes;
op->addr.xmm = reg;
read_sse_reg(ctxt, >vec_val, reg);
return;
@@ -1238,7 +1259,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->d & ByteOp);
if (ctxt->d & Sse) {
op->type = OP_XMM;
-   op->bytes = 16;
+   op->bytes = ctxt->op_bytes;
op->addr.xmm = ctxt->modrm_rm;
read_sse_reg(ctxt, >vec_val, ctxt->modrm_rm);
return rc;
@@ -4529,7 +4550,7 @@ static const struct gprefix pfx_0f_2b = {
 };
 
 static const struct gprefix pfx_0f_10_0f_11 = {
-   I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+   I(Unaligned, em_mov), I(Unaligned, em_mov), I(Unaligned, em_mov), N,
 };
 
 static const struct gprefix pfx_0f_28_0f_29 = {
@@ -5097,7 +5118,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void 
*insn, int insn_len)
 {
int rc = X86EMUL_CONTINUE;
int mode = ctxt->mode;
-   int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+   int def_op_bytes, def_ad_bytes, goffset, simd_prefix = 0;
bool op_prefix = false;
bool has_seg_override = false;
struct opcode opcode;
@@ -5320,7 +5341,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void 
*insn, int insn_len)
ctxt->op_bytes = 4;
 
if (ctxt->d & Sse)
-   ctxt->op_bytes = 16;
+   ctxt->op_bytes = simd_prefix_to_bytes(ctxt,
+ simd_prefix);
else if (ctxt->d & Mmx)
ctxt->op_bytes = 8;
}
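For context, the kind of guest code this patch lets the emulator
handle: an 8-byte movsd store into a write-tracked page (illustrative,
not from the patch):

/* an 8-byte SSE store; inside a write-tracked page it is emulated,
 * and the emulator now decodes the 0xf2 prefix on opcode 0x11 as 8 bytes */
void movsd_store(double *dst, double val)
{
	__asm__ volatile("movsd %1, %0" : "=m"(*dst) : "x"(val));
}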

[RFC PATCH v6 83/92] kvm: x86: emulate movd xmm, m32

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed in order to be able to support guest code that uses movd to
write into pages that are marked for write tracking.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7c79504e58cd..b42a71653622 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1203,6 +1203,11 @@ static u8 simd_prefix_to_bytes(const struct 
x86_emulate_ctxt *ctxt,
if (simd_prefix == 0x66)
bytes = 8;
break;
+   case 0x7e:
+   /* movd xmm, m32 */
+   if (simd_prefix == 0x66)
+   bytes = 4;
+   break;
default:
break;
}
@@ -4564,6 +4569,10 @@ static const struct gprefix pfx_0f_d6 = {
N, I(0, em_mov), N, N,
 };
 
+static const struct gprefix pfx_0f_7e = {
+   N, I(0, em_mov), N, N,
+};
+
 static const struct gprefix pfx_0f_2b = {
ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
 };
@@ -4823,7 +4832,8 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, N,
N, N, N, N,
-   N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+   N, N, GP(ModRM | SrcReg | DstMem | GPRModRM | Mov | Sse, &pfx_0f_7e),
+   GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
X16(D(SrcImm | NearBranch)),
/* 0x90 - 0x9F */

[RFC PATCH v6 67/92] kvm: introspection: use single stepping on unimplemented instructions

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

On emulation failures, we notify the introspection tool for read/write
operations if needed. Unless it responds with RETRY (to re-enter guest),
we continue single stepping the vCPU.

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |  5 +++
 arch/x86/include/asm/vmx.h  |  2 ++
 arch/x86/kvm/kvmi.c | 21 
 arch/x86/kvm/mmu.c  |  5 +++
 arch/x86/kvm/svm.c  |  8 +
 arch/x86/kvm/vmx/vmx.c  | 13 ++--
 arch/x86/kvm/x86.c  | 57 -
 include/linux/kvmi.h|  4 +++
 virt/kvm/kvmi.c | 56 
 virt/kvm/kvmi_int.h |  1 +
 10 files changed, 169 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 60e2c298d469..2392678dde46 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm_vcpu_arch {
/* set at EPT violation at this point */
unsigned long exit_qualification;
 
+   /* #PF translated error code from EPT/NPT exit reason */
+   u64 error_code;
+
/* pv related host specific info */
struct {
bool pv_unhalted;
@@ -1016,6 +1019,7 @@ struct kvm_x86_ops {
void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
bool enable);
bool (*desc_intercept)(struct kvm_vcpu *vcpu, bool enable);
+   u64 (*fault_gla)(struct kvm_vcpu *vcpu);
void (*set_mtf)(struct kvm_vcpu *vcpu, bool enable);
void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
@@ -1627,6 +1631,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 
 void kvm_arch_msr_intercept(struct kvm_vcpu *vcpu, unsigned int msr,
bool enable);
+u64 kvm_mmu_fault_gla(struct kvm_vcpu *vcpu);
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
 bool kvm_spt_fault(struct kvm_vcpu *vcpu);
 void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 11ca64ced578..bc0f5bbd692c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -538,6 +538,7 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_READABLE_BIT 3
 #define EPT_VIOLATION_WRITABLE_BIT 4
 #define EPT_VIOLATION_EXECUTABLE_BIT   5
+#define EPT_VIOLATION_GLA_VALID_BIT 7
 #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
 #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
@@ -545,6 +546,7 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
 #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
 #define EPT_VIOLATION_EXECUTABLE   (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_GLA_VALID (1 << EPT_VIOLATION_GLA_VALID_BIT)
 #define EPT_VIOLATION_GVA_TRANSLATED   (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
 /*
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index f0ab4bd9eb37..9d66c7d6c953 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -759,6 +759,27 @@ int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
return 0;
 }
 
+bool is_ud2_instruction(struct kvm_vcpu *vcpu, int *emulation_type)
+{
+   u8 ud2[] = {0x0F, 0x0B};
+   u8 insn_len = vcpu->arch.emulate_ctxt.fetch.ptr -
+ vcpu->arch.emulate_ctxt.fetch.data;
+
+   if (insn_len != sizeof(ud2))
+   return false;
+
+   if (memcmp(vcpu->arch.emulate_ctxt.fetch.data, ud2, insn_len))
+   return false;
+
+   /* Do not reexecute the UD2 instruction, else we might enter an
+* endless emulation loop. Let the emulator fall through to
+* handle_emulation_failure(), which will inject the #UD exception.
+*/
+   *emulation_type &= ~EMULTYPE_ALLOW_RETRY;
+
+   return true;
+}
+
 void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu)
 {
kvm_set_mtf(vcpu, true);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0b859b1797f6..c2f863797495 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -6667,6 +6667,11 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
 }
 
+u64 kvm_mmu_fault_gla(struct kvm_vcpu *vcpu)
+{
+   return kvm_x86_ops->fault_gla(vcpu);
+}
+
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu)
 {
return kvm_x86_ops->nested_pagefault(vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3481c0247680..cb536a2611f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2675,6 +2675,8 @@ static int pf

[RFC PATCH v6 21/92] kvm: page track: add track_create_slot() callback

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is used to add page access notifications as soon as a slot appears.

CC: Xiao Guangrong 
Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_page_track.h |  5 -
 arch/x86/kvm/page_track.c | 18 --
 arch/x86/kvm/x86.c|  2 +-
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_page_track.h 
b/arch/x86/include/asm/kvm_page_track.h
index 172f9749dbb2..18a94d180485 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -34,6 +34,9 @@ struct kvm_page_track_notifier_node {
 */
void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes, struct kvm_page_track_notifier_node *node);
+   void (*track_create_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned long npages,
+ struct kvm_page_track_notifier_node *node);
/*
 * It is called when memory slot is being moved or removed
 * users can drop write-protection for the pages in that memory slot
@@ -51,7 +54,7 @@ void kvm_page_track_cleanup(struct kvm *kvm);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
 struct kvm_memory_slot *dont);
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
  unsigned long npages);
 
 void kvm_slot_page_track_add_page(struct kvm *kvm,
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a3065..db5b906876bb 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -34,10 +34,13 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot 
*free,
}
 }
 
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
  unsigned long npages)
 {
-   int  i;
+   struct kvm_page_track_notifier_head *head;
+   struct kvm_page_track_notifier_node *n;
+   int idx;
+   int i;
 
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
slot->arch.gfn_track[i] =
@@ -47,6 +50,17 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot 
*slot,
goto track_free;
}
 
+   head = &kvm->arch.track_notifier_head;
+
+   if (hlist_empty(&head->track_notifier_list))
+   return 0;
+
+   idx = srcu_read_lock(&head->track_srcu);
+   hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+   if (n->track_create_slot)
+   n->track_create_slot(kvm, slot, npages, n);
+   srcu_read_unlock(&head->track_srcu, idx);
+
return 0;
 
 track_free:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 30cf0d162aa8..f66db9473ea3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9350,7 +9350,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct 
kvm_memory_slot *slot,
}
}
 
-   if (kvm_page_track_create_memslot(slot, npages))
+   if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
 
return 0;
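A minimal in-kernel sketch of a consumer of the new callback, following
the pattern of the existing track_write hook; the my_* names are made
up, the registration API is the existing kvm_page_track one:

#include <linux/kvm_host.h>
#include <asm/kvm_page_track.h>

static void my_create_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			   unsigned long npages,
			   struct kvm_page_track_notifier_node *node)
{
	/* runs for every new slot: a good place to pre-arm write tracking */
}

static struct kvm_page_track_notifier_node my_node = {
	.track_create_slot = my_create_slot,
};

static void my_init(struct kvm *kvm)
{
	kvm_page_track_register_notifier(kvm, &my_node);
}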

[RFC PATCH v6 75/92] kvm: x86: disable gpa_available optimization in emulator_read_write_onepage()

2019-08-09 Thread Adalbert Lazăr
If the EPT violation was caused by an execute restriction imposed by the
introspection tool, gpa_available will point to the instruction pointer,
not to the read/write location that has to be used to emulate the
current instruction.

This optimization should be disabled only when the VM is introspected,
not just because the introspection subsystem is present.

Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 965c4f0108eb..3975331230b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5532,7 +5532,7 @@ static int emulator_read_write_onepage(unsigned long 
addr, void *val,
 * operation using rep will only have the initial GPA from the NPF
 * occurred.
 */
-   if (vcpu->arch.gpa_available &&
+   if (vcpu->arch.gpa_available && !kvmi_is_present() &&
emulator_can_use_gpa(ctxt) &&
(addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
gpa = vcpu->arch.gpa_val;
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[RFC PATCH v6 86/92] kvm: x86: emulate xorpd xmm2/m128, xmm1

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This adds support for xorpd xmm2/m128, xmm1.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/emulate.c | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 287d3751675d..28aac552b34b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1178,6 +1178,22 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
 }
 
+static int em_xorpd(struct x86_emulate_ctxt *ctxt)
+{
+   const sse128_t *src = &ctxt->src.vec_val;
+   sse128_t *dst = &ctxt->dst.vec_val;
+   sse128_t xmm0;
+
+   asm volatile("movdqu %%xmm0, %0\n"
+"movdqu %1, %%xmm0\n"
+"xorpd %2, %%xmm0\n"
+"movdqu %%xmm0, %1\n"
+"movdqu %0, %%xmm0"
+: "+m"(xmm0), "+m"(*dst) : "m"(*src));
+
+   return X86EMUL_CONTINUE;
+}
+
 static u8 simd_prefix_to_bytes(const struct x86_emulate_ctxt *ctxt,
   int simd_prefix)
 {
@@ -4831,7 +4847,8 @@ static const struct opcode twobyte_table[256] = {
/* 0x40 - 0x4F */
X16(D(DstReg | SrcMem | ModRM)),
/* 0x50 - 0x5F */
-   N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+   N, N, N, N, N, N, N, I(SrcMem | DstReg | ModRM | Unaligned | Sse, em_xorpd),
+   N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
N, N, N, N,
N, N, N, N,

[RFC PATCH v6 17/92] kvm: introspection: introduce event actions

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

All vCPU event replies contain the action requested by the introspection
tool, which can be one of the following:

  * KVMI_EVENT_ACTION_CONTINUE
  * KVMI_EVENT_ACTION_RETRY
  * KVMI_EVENT_ACTION_CRASH

The CONTINUE action can be seen as "continue with the old KVM code
path", while the RETRY action as "re-enter guest".

Note: KVMI_EVENT_UNHOOK, a VM event, doesn't have/need a reply.

Suggested-by: Paolo Bonzini 
Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 10 
 include/uapi/linux/kvmi.h  |  4 +++
 kernel/signal.c|  1 +
 virt/kvm/kvmi.c| 40 ++
 4 files changed, 55 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index e7d9a3816e00..1ea4be0d5a45 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -482,4 +482,14 @@ with two common structures::
__u32 padding2;
};
 
+All events accept the KVMI_EVENT_ACTION_CRASH action, which stops the
+guest ungracefully but as soon as possible.
+
+Most of the events accept the KVMI_EVENT_ACTION_CONTINUE action, which
+lets the instruction that caused the event to continue (unless specified
+otherwise).
+
+Some of the events accept the KVMI_EVENT_ACTION_RETRY action, to continue
+by re-entering the guest.
+
 Specific data can follow these common structures.
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index dda2ae352611..ccf2239b5db4 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -66,6 +66,10 @@ enum {
KVMI_NUM_EVENTS
 };
 
+#define KVMI_EVENT_ACTION_CONTINUE  0
+#define KVMI_EVENT_ACTION_RETRY 1
+#define KVMI_EVENT_ACTION_CRASH 2
+
 #define KVMI_MSG_SIZE (4096 - sizeof(struct kvmi_msg_hdr))
 
 struct kvmi_msg_hdr {
diff --git a/kernel/signal.c b/kernel/signal.c
index 57b7771e20d7..9befbfaaa710 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1413,6 +1413,7 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, 
struct pid *pid)
 */
}
 }
+EXPORT_SYMBOL(kill_pid_info);
 
 static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
 {
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 3cc7bb035796..0d3560b74f2d 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -511,6 +511,46 @@ void kvmi_destroy_vm(struct kvm *kvm)
wait_for_completion_killable(&kvm->kvmi_completed);
 }
 
+static int kvmi_vcpu_kill(int sig, struct kvm_vcpu *vcpu)
+{
+   int err = -ESRCH;
+   struct pid *pid;
+   struct kernel_siginfo siginfo[1] = {};
+
+   rcu_read_lock();
+   pid = rcu_dereference(vcpu->pid);
+   if (pid)
+   err = kill_pid_info(sig, siginfo, pid);
+   rcu_read_unlock();
+
+   return err;
+}
+
+static void kvmi_vm_shutdown(struct kvm *kvm)
+{
+   int i;
+   struct kvm_vcpu *vcpu;
+
+   kvm_for_each_vcpu(i, vcpu, kvm)
+   kvmi_vcpu_kill(SIGTERM, vcpu);
+}
+
+void kvmi_handle_common_event_actions(struct kvm_vcpu *vcpu, u32 action,
+ const char *str)
+{
+   struct kvm *kvm = vcpu->kvm;
+
+   switch (action) {
+   case KVMI_EVENT_ACTION_CRASH:
+   kvmi_vm_shutdown(kvm);
+   break;
+
+   default:
+   kvmi_err(IKVM(kvm), "Unsupported action %d for event %s\n",
+action, str);
+   }
+}
+
 void kvmi_run_jobs(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
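An introspector-side sketch of how the actions are meant to be used
when replying to an event; kvmi_reply_event() is an assumed transport
helper, while the action values are the ones defined above:

#include <stdint.h>

#define KVMI_EVENT_ACTION_CONTINUE  0
#define KVMI_EVENT_ACTION_RETRY     1
#define KVMI_EVENT_ACTION_CRASH     2

/* assumed transport helper */
int kvmi_reply_event(int fd, uint32_t seq, uint32_t action);

/* reply to a breakpoint event */
int handle_bp_event(int fd, uint32_t seq, int placed_by_us)
{
	/*
	 * If the breakpoint is ours it was already handled, so just
	 * re-enter the guest; otherwise let KVM inject #BP as usual.
	 */
	uint32_t action = placed_by_us ? KVMI_EVENT_ACTION_RETRY
				       : KVMI_EVENT_ACTION_CONTINUE;

	return kvmi_reply_event(fd, seq, action);
}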

[RFC PATCH v6 22/92] kvm: x86: provide all page tracking hooks with the guest virtual address

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This is needed because the emulator calls the page tracking code
irrespective of the current VMEXIT reason or available information.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h   |  2 +-
 arch/x86/include/asm/kvm_page_track.h |  9 +
 arch/x86/kvm/mmu.c|  2 +-
 arch/x86/kvm/page_track.c |  6 +++---
 arch/x86/kvm/x86.c| 16 
 drivers/gpu/drm/i915/gvt/kvmgt.c  |  2 +-
 6 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 67ed934ca124..2d6bde6fa59f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1263,7 +1263,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned 
int kvm_nr_mmu_pages);
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 bool pdptrs_changed(struct kvm_vcpu *vcpu);
 
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
  const void *val, int bytes);
 
 struct kvm_irq_mask_notifier {
diff --git a/arch/x86/include/asm/kvm_page_track.h 
b/arch/x86/include/asm/kvm_page_track.h
index 18a94d180485..0492a85f3a44 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -32,8 +32,9 @@ struct kvm_page_track_notifier_node {
 * @bytes: the written length.
 * @node: this node
 */
-   void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
-   int bytes, struct kvm_page_track_notifier_node *node);
+   void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+   const u8 *new, int bytes,
+   struct kvm_page_track_notifier_node *node);
void (*track_create_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
  unsigned long npages,
  struct kvm_page_track_notifier_node *node);
@@ -72,7 +73,7 @@ kvm_page_track_register_notifier(struct kvm *kvm,
 void
 kvm_page_track_unregister_notifier(struct kvm *kvm,
   struct kvm_page_track_notifier_node *n);
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes);
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+ const u8 *new, int bytes);
 void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f2d1d230d5b8..9898d863b6b6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5222,7 +5222,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, 
gpa_t gpa, int *nspte)
return spte;
 }
 
-static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
  const u8 *new, int bytes,
  struct kvm_page_track_notifier_node *node)
 {
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index db5b906876bb..ff7defb4a1d2 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -236,8 +236,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
  * The node should figure out if the written page is the one that node is
  * interested in by itself.
  */
-void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes)
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+ const u8 *new, int bytes)
 {
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
@@ -251,7 +251,7 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, 
const u8 *new,
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
if (n->track_write)
-   n->track_write(vcpu, gpa, new, bytes, n);
+   n->track_write(vcpu, gpa, gva, new, bytes, n);
srcu_read_unlock(&head->track_srcu, idx);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f66db9473ea3..d3d159986243 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5281,7 +5281,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, 
unsigned long gva,
return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
 }
 
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
const void *val, int bytes)
 {
int ret;
@@ -5289,14 +5289,14 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t 
gpa,
ret = kvm_vcpu_write_guest(vcp
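
A sketch of a track_write consumer after this change: the callback can
now log or filter on the guest virtual address as well. The my_* names
are made up for illustration:

#include <linux/kvm_host.h>
#include <asm/kvm_page_track.h>

static void my_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
			   const u8 *new, int bytes,
			   struct kvm_page_track_notifier_node *node)
{
	/* gva may be 0 when no virtual address was available at VMEXIT */
	pr_debug("tracked write: %d bytes, gpa=0x%llx gva=0x%lx\n",
		 bytes, gpa, gva);
}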

[RFC PATCH v6 12/92] kvm: introspection: add a jobs list to every introspected vCPU

2019-08-09 Thread Adalbert Lazăr
Every vCPU has a lock-protected list in which (mostly) the receiving
worker places the jobs to be done by the vCPU once it is kicked out
of the guest (via KVM_REQ_INTROSPECTION).

A job is defined by a "do" function, a pointer (context) and a "free"
function.

Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvm_host.h |   1 +
 virt/kvm/kvmi.c | 102 +++-
 virt/kvm/kvmi_int.h |   9 +++
 3 files changed, 111 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 180373360e34..67ed934ca124 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -78,6 +78,7 @@
 #define KVM_REQ_HV_STIMER  KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP   KVM_ARCH_REQ(23)
 #define KVM_REQ_GET_VMCS12_PAGES   KVM_ARCH_REQ(24)
+#define KVM_REQ_INTROSPECTION  KVM_ARCH_REQ(25)
 
 #define CR0_RESERVED_BITS   \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 860574039221..07ebd1c629b0 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -11,6 +11,9 @@
 #include 
 
 static struct kmem_cache *msg_cache;
+static struct kmem_cache *job_cache;
+
+static void kvmi_abort_events(struct kvm *kvm);
 
 void *kvmi_msg_alloc(void)
 {
@@ -34,14 +37,19 @@ static void kvmi_cache_destroy(void)
 {
kmem_cache_destroy(msg_cache);
msg_cache = NULL;
+   kmem_cache_destroy(job_cache);
+   job_cache = NULL;
 }
 
 static int kvmi_cache_create(void)
 {
+   job_cache = kmem_cache_create("kvmi_job",
+ sizeof(struct kvmi_job),
+ 0, SLAB_ACCOUNT, NULL);
msg_cache = kmem_cache_create("kvmi_msg", KVMI_MSG_SIZE_ALLOC,
  4096, SLAB_ACCOUNT, NULL);
 
-   if (!msg_cache) {
+   if (!msg_cache || !job_cache) {
kvmi_cache_destroy();
 
return -1;
@@ -80,6 +88,53 @@ static bool alloc_kvmi(struct kvm *kvm, const struct 
kvm_introspection *qemu)
return true;
 }
 
+static int __kvmi_add_job(struct kvm_vcpu *vcpu,
+ void (*fct)(struct kvm_vcpu *vcpu, void *ctx),
+ void *ctx, void (*free_fct)(void *ctx))
+{
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+   struct kvmi_job *job;
+
+   job = kmem_cache_zalloc(job_cache, GFP_KERNEL);
+   if (unlikely(!job))
+   return -ENOMEM;
+
+   INIT_LIST_HEAD(&job->link);
+   job->fct = fct;
+   job->ctx = ctx;
+   job->free_fct = free_fct;
+
+   spin_lock(&ivcpu->job_lock);
+   list_add_tail(&job->link, &ivcpu->job_list);
+   spin_unlock(&ivcpu->job_lock);
+
+   return 0;
+}
+
+int kvmi_add_job(struct kvm_vcpu *vcpu,
+void (*fct)(struct kvm_vcpu *vcpu, void *ctx),
+void *ctx, void (*free_fct)(void *ctx))
+{
+   int err;
+
+   err = __kvmi_add_job(vcpu, fct, ctx, free_fct);
+
+   if (!err) {
+   kvm_make_request(KVM_REQ_INTROSPECTION, vcpu);
+   kvm_vcpu_kick(vcpu);
+   }
+
+   return err;
+}
+
+static void kvmi_free_job(struct kvmi_job *job)
+{
+   if (job->free_fct)
+   job->free_fct(job->ctx);
+
+   kmem_cache_free(job_cache, job);
+}
+
 static bool alloc_ivcpu(struct kvm_vcpu *vcpu)
 {
struct kvmi_vcpu *ivcpu;
@@ -88,6 +143,9 @@ static bool alloc_ivcpu(struct kvm_vcpu *vcpu)
if (!ivcpu)
return false;
 
+   INIT_LIST_HEAD(&ivcpu->job_list);
+   spin_lock_init(&ivcpu->job_lock);
+
vcpu->kvmi = ivcpu;
 
return true;
@@ -101,6 +159,27 @@ struct kvmi * __must_check kvmi_get(struct kvm *kvm)
return NULL;
 }
 
+static void kvmi_clear_vcpu_jobs(struct kvm *kvm)
+{
+   int i;
+   struct kvm_vcpu *vcpu;
+   struct kvmi_job *cur, *next;
+
+   kvm_for_each_vcpu(i, vcpu, kvm) {
+   struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+
+   if (!ivcpu)
+   continue;
+
+   spin_lock(&ivcpu->job_lock);
+   list_for_each_entry_safe(cur, next, &ivcpu->job_list, link) {
+   list_del(&cur->link);
+   kvmi_free_job(cur);
+   }
+   spin_unlock(&ivcpu->job_lock);
+   }
+}
+
 static void kvmi_destroy(struct kvm *kvm)
 {
struct kvm_vcpu *vcpu;
@@ -118,6 +197,7 @@ static void kvmi_destroy(struct kvm *kvm)
 static void kvmi_release(struct kvm *kvm)
 {
kvmi_sock_put(IKVM(kvm));
+   kvmi_clear_vcpu_jobs(kvm);
kvmi_destroy(kvm);
 
complete(&kvm->kvmi_completed);
@@ -179,6 +259,13 @@ static void kvmi_end_introspection(struct kvmi *ikvm)
/* Signal QEMU which
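
A sketch of queueing a job with the API introduced here; the my_*
names and context layout are made up. Note that free_fct releases the
context after the job runs (or when the job list is cleared), so the
"do" function must not free it:

#include <linux/kvm_host.h>
#include <linux/slab.h>
#include "kvmi_int.h"		/* kvmi_add_job() */

struct my_ctx {
	unsigned long data;	/* illustrative payload */
};

static void my_job_fct(struct kvm_vcpu *vcpu, void *ctx)
{
	struct my_ctx *c = ctx;

	c->data++;		/* runs on the vCPU thread, outside guest mode */
}

static void my_job_free(void *ctx)
{
	kfree(ctx);
}

static int queue_my_job(struct kvm_vcpu *vcpu)
{
	struct my_ctx *c = kzalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return -ENOMEM;

	return kvmi_add_job(vcpu, my_job_fct, c, my_job_free);
}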

[RFC PATCH v6 10/92] kvm: introspection: add KVMI_CONTROL_VM_EVENTS

2019-08-09 Thread Adalbert Lazăr
No introspection event (neither VM event, nor vCPU event) will be sent
to the introspection tool unless enabled/requested.

This command enables/disables VM events. For now, these events are:

  * KVMI_EVENT_UNHOOK
  * KVMI_EVENT_CREATE_VCPU

The first event is initiated by userspace/QEMU in order to give the
introspection tool a chance to remove its hooks in the event of
pause/suspend/migrate.

The second event is actually a vCPU event, added to cover the case when
the introspection tool has paused all vCPUs and userspace hotplugs (and
starts) another one. The event is controlled by this command because its
status (enabled/disabled) is kept in the VM related structures (as opposed
to vCPU related structures). I didn't have a better idea. Not to mention
that the vCPU events are controlled with commands like "enable/disable
event X for vCPU Y" and Y is _unknown_ for X=KVMI_EVENT_CREATE_VCPU.

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 39 ++
 include/uapi/linux/kvmi.h  |  7 ++
 virt/kvm/kvmi.c| 11 +
 virt/kvm/kvmi_int.h|  3 +++
 virt/kvm/kvmi_msg.c| 23 ++
 5 files changed, 83 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 2fbe7c28e4f1..a660def20b23 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -380,3 +380,42 @@ This command is always allowed.
};
 
 Returns the number of online vCPUs.
+
+6. KVMI_CONTROL_VM_EVENTS
+-
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_control_vm_events {
+   __u16 event_id;
+   __u8 enable;
+   __u8 padding1;
+   __u32 padding2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Enables/disables VM introspection events. This command can be used with
+the following events::
+
+   KVMI_EVENT_CREATE_VCPU
+   KVMI_EVENT_UNHOOK
+
+When an event is enabled, the introspection tool is notified and,
+in almost all cases, it must reply with: continue, retry, crash, etc.
+(see **Events** below).
+
+:Errors:
+
+* -KVM_EINVAL - the event ID is invalid
+* -KVM_EINVAL - padding is not zero
+* -KVM_EPERM - the access is restricted by the host
+
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 367c8ec28f75..ff35faabb7ed 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -107,4 +107,11 @@ struct kvmi_get_guest_info_reply {
__u32 padding[3];
 };
 
+struct kvmi_control_vm_events {
+   __u16 event_id;
+   __u8 enable;
+   __u8 padding1;
+   __u32 padding2;
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index dc1bb8326763..961e6cc13fb6 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -338,6 +338,17 @@ void kvmi_destroy_vm(struct kvm *kvm)
wait_for_completion_killable(&kvm->kvmi_completed);
 }
 
+int kvmi_cmd_control_vm_events(struct kvmi *ikvm, unsigned int event_id,
+  bool enable)
+{
+   if (enable)
+   set_bit(event_id, ikvm->vm_ev_mask);
+   else
+   clear_bit(event_id, ikvm->vm_ev_mask);
+
+   return 0;
+}
+
 int kvmi_ioctl_unhook(struct kvm *kvm, bool force_reset)
 {
struct kvmi *ikvm;
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 157f765fb34d..84ba43bd9a9d 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -85,6 +85,7 @@ struct kvmi {
 
DECLARE_BITMAP(cmd_allow_mask, KVMI_NUM_COMMANDS);
DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
+   DECLARE_BITMAP(vm_ev_mask, KVMI_NUM_EVENTS);
 
bool cmd_reply_disabled;
 };
@@ -99,5 +100,7 @@ bool kvmi_msg_process(struct kvmi *ikvm);
 void *kvmi_msg_alloc(void);
 void *kvmi_msg_alloc_check(size_t size);
 void kvmi_msg_free(void *addr);
+int kvmi_cmd_control_vm_events(struct kvmi *ikvm, unsigned int event_id,
+  bool enable);
 
 #endif
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index cf8a120b0eae..a55c9e35be36 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -12,6 +12,7 @@ static const char *const msg_IDs[] = {
[KVMI_CHECK_COMMAND] = "KVMI_CHECK_COMMAND",
[KVMI_CHECK_EVENT]   = "KVMI_CHECK_EVENT",
[KVMI_CONTROL_CMD_RESPONSE]  = "KVMI_CONTROL_CMD_RESPONSE",
+   [KVMI_CONTROL_VM_EVENTS] = "KVMI_CONTROL_VM_EVENTS",
[KVMI_GET_GUEST_INFO]= "KVMI_GET_GUEST_INFO",
[KVMI_GET_VERSION]   = "KVMI_GET_VERSION",
 };
@@ -226,6 +227,27 @@ static int handle_get_guest_info(struct kvmi *ikvm,
return kvmi_msg_vm_maybe_reply(ikvm, msg, 0, &rpl, sizeof(rpl));
 }
 
+static int handle_control_vm_events(str
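
An introspector-side sketch of enabling the unhook notification with
this command; kvmi_send_vm_msg() and the msg_id/event id arguments are
assumed transport details, while the request layout and the
zero-padding rule are from the patch:

#include <stdint.h>
#include <string.h>

struct kvmi_control_vm_events {		/* as defined by the patch */
	uint16_t event_id;
	uint8_t enable;
	uint8_t padding1;
	uint32_t padding2;
};

/* assumed transport helper */
int kvmi_send_vm_msg(int fd, int msg_id, const void *req, size_t size);

int enable_unhook_event(int fd, int msg_id, uint16_t unhook_event_id)
{
	struct kvmi_control_vm_events req;

	memset(&req, 0, sizeof(req));	/* padding must be zero */
	req.event_id = unhook_event_id;	/* KVMI_EVENT_UNHOOK */
	req.enable = 1;

	return kvmi_send_vm_msg(fd, msg_id, &req, sizeof(req));
}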

[RFC PATCH v6 34/92] Documentation: Introduce EPT based Subpage Protection

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

Co-developed-by: yi.z.zh...@linux.intel.com
Signed-off-by: yi.z.zh...@linux.intel.com
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-2-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/spp_kvm.txt | 173 ++
 1 file changed, 173 insertions(+)
 create mode 100644 Documentation/virtual/kvm/spp_kvm.txt

diff --git a/Documentation/virtual/kvm/spp_kvm.txt 
b/Documentation/virtual/kvm/spp_kvm.txt
new file mode 100644
index ..bdf94922cba9
--- /dev/null
+++ b/Documentation/virtual/kvm/spp_kvm.txt
@@ -0,0 +1,173 @@
+EPT-Based Sub-Page Protection (SPP) for KVM
+
+
+1.Overview
+  EPT-based Sub-Page Protection (SPP) allows the VMM to specify
+  fine-grained (128-byte per sub-page) write-protection for guest physical
+  memory. When it's enabled, the CPU enforces write-access permission
+  for the sub-pages within a 4KB page: if the corresponding bit is set in
+  the permission vector, writes to the sub-page region are allowed;
+  otherwise, they are prevented with an EPT violation.
+
+2.SPP Operation
+  Sub-Page Protection Table (SPPT) is introduced to manage sub-page
+  write-access permission.
+
+  It is active when:
+  a) large paging is disabled on the host side.
+  b) the "sub-page write protection" VM-execution control is 1.
+  c) SPP is initialized successfully with the KVM_INIT_SPP ioctl.
+  d) sub-page permissions are set successfully with the
+     KVM_SUBPAGES_SET_ACCESS ioctl. See the sections below for details.
+
+  ______________________________________________________________________
+
+  How SPP hardware works:
+  ______________________________________________________________________
+
+  Guest write access --> GPA --> Walk EPT --> EPT leaf entry ----|
+  |--------------------------------------------------------------|
+  |-> if VMexec_control.spp && ept_leaf_entry.spp_bit (bit 61)
+       |
+       |-> <false> --> EPT legacy behavior
+       |
+       |-> <true>  --> if ept_leaf_entry.writable
+                        |
+                        |-> <true>  --> Ignore SPP
+                        |
+                        |-> <false> --> GPA --> Walk SPP 4-level table--|
+                                                                        |
+             |<------- get the SPPT pointer from the VMCS field <------|
+             |
+  Walk SPP L4E table
+       |
+       |---> if entry misconfigured >------------------------|---<------|
+       |                                                     |          |
+      else                                                   |          |
+       |                                                     |          |
+       |   |--------------- SPP VMexit <---------------------|          |
+       |   |                                                            |
+       |   |-> exit_qualification & sppt_misconfig --> sppt misconfig   |
+       |   |                                                            |
+       |   |-> exit_qualification & sppt_miss --> sppt miss             |
+       |---|                                                            |
+       |                                                                |
+  walk SPPT L3E --|--> if entry misconfigured >--------------->---------|
+       |                                                                |
+      else                                                              |
+       |                                                                |
+  walk SPPT L2E --|--> if entry misconfigured >--------------->---------|
+       |                                                                |
+      else                                                              |
+       |                                                                |
+  walk SPPT L1E --|--> if entry misconfigured >--------------->---------|
+       |
+      else
+       |
+       |-> if sub-page writable
+            |-> <true>  --> allow, write access
+            |-> <false> --> disallow, EPT violation
+  ______________________________________________________________________
+
+3.IOCTL Interfaces
+
+KVM_INIT_SPP:
+Allocate storage for sub-page permission vectors and SPPT root page.
+
+KVM_SUBPAGES_GET_ACCESS:
+Get sub-page write-permission vectors for the given contiguous guest pages.
+
+KVM_SUBPAGES_SET_ACCESS:
+Set sub-page write-permission vectors for the given contiguous guest pages.
+
+/* for KVM_SUBPAGES_GET_ACCESS and KVM_SUBPAGES_SET_A
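
The ioctl argument structure is cut off above; independent of its exact
layout, the permission-vector semantics from section 1 can be made
concrete: each 4KB page is covered by 32 sub-pages of 128 bytes, so one
32-bit write-permission vector per page, with bit N covering byte
offsets [N*128, N*128+127]. A minimal sketch (an illustration, not part
of the patch):

    #include <stdint.h>

    #define SPP_SUBPAGE_SIZE 128                    /* from section 1 */
    #define SPP_SUBPAGES (4096 / SPP_SUBPAGE_SIZE)  /* 32 per 4KB page */

    /* Allow writes to the 128-byte region covering this page offset. */
    static inline uint32_t spp_allow_write(uint32_t vector, uint32_t page_off)
    {
            return vector | (1u << (page_off / SPP_SUBPAGE_SIZE));
    }

    /* Would a write at this page offset be allowed under this vector? */
    static inline int spp_write_allowed(uint32_t vector, uint32_t page_off)
    {
            return (vector >> (page_off / SPP_SUBPAGE_SIZE)) & 1;
    }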

[RFC PATCH v6 35/92] KVM: VMX: Add control flags for SPP enabling

2019-08-09 Thread Adalbert Lazăr
From: Yang Weijiang 

Check the SPP capability in MSR_IA32_VMX_PROCBASED_CTLS2; bit 23
indicates SPP support. Mark the SPP bit in the CPU capabilities
bitmap if it is supported.

Co-developed-by: He Chen 
Signed-off-by: He Chen 
Co-developed-by: Zhang Yi 
Signed-off-by: Zhang Yi 
Co-developed-by: Yang Weijiang 
Signed-off-by: Yang Weijiang 
Message-Id: <20190717133751.12910-3-weijiang.y...@intel.com>
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/vmx.h |  1 +
 arch/x86/kernel/cpu/intel.c|  4 
 arch/x86/kvm/vmx/capabilities.h|  5 +
 arch/x86/kvm/vmx/vmx.c | 10 ++
 5 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 6d6122524711..183b4fd864c6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -228,6 +228,7 @@
 #define X86_FEATURE_FLEXPRIORITY   ( 8*32+ 2) /* Intel FlexPriority */
 #define X86_FEATURE_EPT( 8*32+ 3) /* Intel Extended 
Page Table */
 #define X86_FEATURE_VPID   ( 8*32+ 4) /* Intel Virtual Processor 
ID */
+#define X86_FEATURE_SPP( 8*32+ 5) /* Intel EPT-based 
Sub-Page Write Protection */
 
 #define X86_FEATURE_VMMCALL( 8*32+15) /* Prefer VMMCALL to VMCALL 
*/
 #define X86_FEATURE_XENPV  ( 8*32+16) /* "" Xen paravirtual guest 
*/
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 4e4133e86484..a2c9e18e0ad7 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -81,6 +81,7 @@
 #define SECONDARY_EXEC_XSAVES  0x00100000
 #define SECONDARY_EXEC_PT_USE_GPA  0x01000000
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
+#define SECONDARY_EXEC_ENABLE_SPP  0x00800000
 #define SECONDARY_EXEC_TSC_SCALING  0x02000000
 
 #define PIN_BASED_EXT_INTR_MASK 0x00000001
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fc3c07fe7df5..b55156ce16da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -476,6 +476,7 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 #define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
 #define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
 #define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS2_SPP 0x00800000
 
u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
u32 msr_vpid_cap, msr_ept_cap;
@@ -486,6 +487,7 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_EPT);
clear_cpu_cap(c, X86_FEATURE_VPID);
clear_cpu_cap(c, X86_FEATURE_EPT_AD);
+   clear_cpu_cap(c, X86_FEATURE_SPP);
 
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
msr_ctl = vmx_msr_high | vmx_msr_low;
@@ -509,6 +511,8 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
set_cpu_cap(c, X86_FEATURE_VPID);
+   if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_SPP)
+   set_cpu_cap(c, X86_FEATURE_SPP);
}
 }
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 854e144131c6..8221ecbf6516 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -239,6 +239,11 @@ static inline bool cpu_has_vmx_pml(void)
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 }
 
+static inline bool cpu_has_vmx_ept_spp(void)
+{
+   return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_SPP;
+}
+
 static inline bool vmx_xsaves_supported(void)
 {
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 97cfd5a316f3..f94e3defd9cf 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -114,6 +114,8 @@ static u64 __read_mostly host_xss;
 bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+static bool __read_mostly spp_supported = 0;
+
 #define MSR_BITMAP_MODE_X2APIC 1
 #define MSR_BITMAP_MODE_X2APIC_APICV   2
 
@@ -2247,6 +2249,7 @@ static __init int setup_vmcs_config(struct vmcs_config 
*vmcs_conf,
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
+   SECONDARY_EXEC_ENABLE_SPP |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
@@ -3901,6 +3904,9 @@ static void vmx_compute_secondary_exec_control(struct 
vcpu_vmx *vmx)
if (!enable_pml)
exec_c
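
The vmx.c hunk is cut short above. The capability check it builds on can
be modelled on its own: VMX capability MSRs report the controls that may
be set to 1 in their high 32 bits, so SPP support amounts to testing
SECONDARY_EXEC_ENABLE_SPP (bit 23) in the allowed-1 half of
MSR_IA32_VMX_PROCBASED_CTLS2. A self-contained sketch, not the patch's
code:

    #include <assert.h>
    #include <stdint.h>

    #define SECONDARY_EXEC_ENABLE_SPP 0x00800000u   /* bit 23, as in vmx.h above */

    static int spp_supported_by_cpu(uint64_t msr_procbased_ctls2)
    {
            /* bits 63:32 are the allowed-1 settings */
            uint32_t allowed1 = (uint32_t)(msr_procbased_ctls2 >> 32);

            return !!(allowed1 & SECONDARY_EXEC_ENABLE_SPP);
    }

    int main(void)
    {
            assert(spp_supported_by_cpu((uint64_t)SECONDARY_EXEC_ENABLE_SPP << 32));
            assert(!spp_supported_by_cpu(0));
            return 0;
    }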

[RFC PATCH v6 54/92] kvm: introspection: add KVMI_CONTROL_CR and KVMI_EVENT_CR

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

Using the KVMI_CONTROL_CR command, the introspection tool subscribes to
KVMI_EVENT_CR events that will be sent when CR{0,3,4} is going to
be changed.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 70 ++
 arch/x86/include/asm/kvm_host.h|  2 +
 arch/x86/include/asm/kvmi_host.h   | 16 +
 arch/x86/include/uapi/asm/kvmi.h   | 18 ++
 arch/x86/kvm/kvmi.c| 95 ++
 arch/x86/kvm/svm.c |  5 ++
 arch/x86/kvm/vmx/vmx.c | 14 +
 arch/x86/kvm/x86.c | 19 +-
 virt/kvm/kvmi_int.h|  7 +++
 virt/kvm/kvmi_msg.c| 13 
 10 files changed, 258 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 1eaed7c61148..2e6e285c8e2e 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -1007,6 +1007,41 @@ order to be notified if the exception was not delivered.
 * -KVM_EINVAL - padding is not zero
 * -KVM_EAGAIN - the selected vCPU can't be introspected yet
 
+21. KVMI_CONTROL_CR
+---
+
+:Architectures: x86
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_control_cr {
+   __u8 enable;
+   __u8 padding1;
+   __u16 padding2;
+   __u32 cr;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code
+
+Enables/disables introspection for a specific control register and must
+be used in addition to *KVMI_CONTROL_EVENTS* with the *KVMI_EVENT_CR*
+ID set.
+
+:Errors:
+
+* -KVM_EINVAL - the selected vCPU is invalid
+* -KVM_EINVAL - the specified control register is not part of the CR0, CR3
+   or CR4 set
+* -KVM_EINVAL - padding is not zero
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+
 Events
 ==
 
@@ -1238,3 +1273,38 @@ introspection has been enabled for this event (see 
*KVMI_CONTROL_EVENTS*).
 ``kvmi_event``, exception/interrupt number (vector), exception/interrupt
 type, exception code (``error_code``) and CR2 are sent to the introspector.
 
+6. KVMI_EVENT_CR
+
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_cr {
+   __u16 cr;
+   __u16 padding[3];
+   __u64 old_value;
+   __u64 new_value;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+   struct kvmi_event_cr_reply {
+   __u64 new_val;
+   };
+
+This event is sent when a control register is going to be changed and the
+introspection has been enabled for this event and for this specific
+register (see **KVMI_CONTROL_EVENTS**).
+
+``kvmi_event``, the control register number, the old value and the new value
+are sent to the introspector. The *CONTINUE* action will set the control
+register to ``new_val``.
+
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7ee6e1ff5ee9..22f08f2732cc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1013,6 +1013,7 @@ struct kvm_x86_ops {
bool (*has_emulated_msr)(int index);
void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
+   void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
bool (*spt_fault)(struct kvm_vcpu *vcpu);
 
@@ -1622,5 +1623,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
 
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
 bool kvm_spt_fault(struct kvm_vcpu *vcpu);
+void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
index 7ab6dd71a0c2..83a098dc8939 100644
--- a/arch/x86/include/asm/kvmi_host.h
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -9,4 +9,20 @@ struct kvmi_arch_mem_access {
unsigned long 
active[KVM_PAGE_TRACK_MAX][BITS_TO_LONGS(KVM_MEM_SLOTS_NUM)];
 };
 
+#ifdef CONFIG_KVM_INTROSPECTION
+
+bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
+  unsigned long old_value, unsigned long *new_value);
+
+#else /* CONFIG_KVM_INTROSPECTION */
+
+static inline bool kvmi_cr_event(struct kvm_vcpu *vcpu, unsigned int cr,
+unsigned long old_value,
+unsigned long *new_value)
+{
+   return true;
+}
+
+#endif /* CONFIG_KVM_INTROSPECTION */
+
 #endif /* _ASM_X86_KVMI_HOST_H */
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
index b074ad735e84..c983b4bd2c72 100644
--- a/arch/x86/include/uapi/asm/kvmi.h
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -61,4 +61,22 @@ struct kvmi_get_cpuid_reply {
__u32 edx;
 };
 
+struct kvmi_control_cr {
+   __u8 
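
The uapi hunk is truncated here, but the same structure appears in the
documentation hunk above. For illustration, a sketch of how an
introspection tool might ask for CR3 write events; the kvmi_msg_hdr
layout (id/size/seq) is an assumption and the KVMI_CONTROL_CR message id
is taken as a parameter. Per the documentation, *KVMI_CONTROL_EVENTS*
must also enable *KVMI_EVENT_CR*:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    struct kvmi_msg_hdr { uint16_t id; uint16_t size; uint32_t seq; }; /* assumed */
    struct kvmi_vcpu_hdr { uint16_t vcpu; uint16_t padding1; uint32_t padding2; };
    struct kvmi_control_cr {
            uint8_t enable;
            uint8_t padding1;
            uint16_t padding2;
            uint32_t cr;
    };

    static int watch_cr3(int fd, uint16_t control_cr_msg_id, uint32_t seq,
                         uint16_t vcpu)
    {
            struct {
                    struct kvmi_msg_hdr hdr;
                    struct kvmi_vcpu_hdr vcpu_hdr;
                    struct kvmi_control_cr cmd;
            } req;

            memset(&req, 0, sizeof(req));   /* padding must be zero */
            req.hdr.id = control_cr_msg_id;
            req.hdr.size = sizeof(req) - sizeof(req.hdr);
            req.hdr.seq = seq;
            req.vcpu_hdr.vcpu = vcpu;
            req.cmd.enable = 1;
            req.cmd.cr = 3;                 /* only CR0, CR3 and CR4 are valid */

            return write(fd, &req, sizeof(req)) == sizeof(req) ? 0 : -1;
    }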

[RFC PATCH v6 71/92] mm: add support for remote mapping

2019-08-09 Thread Adalbert Lazăr
From: Mircea Cîrjaliu 

The following four new mm exports are introduced:
 * mm_remote_map(struct mm_struct *req_mm,
 unsigned long req_hva,
 unsigned long map_hva)
 * mm_remote_unmap(unsigned long map_hva)
 * mm_remote_reset(void)
 * rmap_walk_remote(struct page *page,
struct rmap_walk_control *rwc)

This patch allows one process to map into its address space a page from
another process. The previous page (if it exists) is dropped. There is
no corresponding pair of system calls as this API is meant to be used
by the kernel itself only.

The targeted user is the upcoming KVM VM introspection subsystem (KVMI),
where an introspector running in its own VM will map pages from the
introspected guest in order to eliminate round trips to the host kernel
(read/write guest pages).

The flow is as follows: the introspector identifies a guest physical
address where some information of interest is located. It creates a one
page anonymous mapping with MAP_LOCKED | MAP_POPULATE and calls the
kernel via an IOCTL on /dev/kvmmem, giving the map virtual address and
the guest physical address as arguments. The kernel converts the map
va into a physical page (gpa in KVM-speak) and passes it to the host
kernel via a hypercall, along with the introspected guest gpa. The host
kernel converts the two gpa-s into their appropriate hva-s (host virtual
addresses) and makes sure the vma backing the page belonging to the VM
in which the introspector runs points to the indicated page in the
introspected guest. I have not included here the use of the mapping
token described in the KVMI documentation.

Signed-off-by: Mircea Cîrjaliu 
Signed-off-by: Adalbert Lazăr 
---
 include/linux/page-flags.h  |9 +-
 include/linux/remote_mapping.h  |  167 +++
 include/uapi/linux/remote_mapping.h |   18 +
 mm/Kconfig  |8 +
 mm/Makefile |1 +
 mm/memory-failure.c |   69 +-
 mm/migrate.c|9 +-
 mm/remote_mapping.c | 1834 +++
 mm/rmap.c   |   13 +-
 mm/vmscan.c |3 +-
 10 files changed, 2108 insertions(+), 23 deletions(-)
 create mode 100644 include/linux/remote_mapping.h
 create mode 100644 include/uapi/linux/remote_mapping.h
 create mode 100644 mm/remote_mapping.c

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 39b4494e29f1..3f65b2833562 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -417,8 +417,10 @@ PAGEFLAG(Idle, idle, PF_ANY)
  */
 #define PAGE_MAPPING_ANON  0x1
 #define PAGE_MAPPING_MOVABLE   0x2
+#define PAGE_MAPPING_REMOTE0x4
 #define PAGE_MAPPING_KSM   (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
-#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
+#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE | \
+PAGE_MAPPING_REMOTE)
 
 static __always_inline int PageMappingFlags(struct page *page)
 {
@@ -431,6 +433,11 @@ static __always_inline int PageAnon(struct page *page)
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
+static __always_inline int PageRemote(struct page *page)
+{
+   return ((unsigned long)page->mapping & PAGE_MAPPING_REMOTE) != 0;
+}
+
 static __always_inline int __PageMovable(struct page *page)
 {
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
diff --git a/include/linux/remote_mapping.h b/include/linux/remote_mapping.h
new file mode 100644
index ..d30d0d10e51d
--- /dev/null
+++ b/include/linux/remote_mapping.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _REMOTE_MAPPING_H
+#define _REMOTE_MAPPING_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct page_db {
+   struct mm_struct *target;   /* target for this mapping */
+   unsigned long req_hva;  /* HVA in target */
+   unsigned long map_hva;  /* HVA in client */
+
+   refcount_t refcnt;  /* client-side sharing */
+   int flags;
+
+   /* target links - serialized by target_db->lock */
+   struct list_head target_link;   /* target-side link */
+
+   /* client links - serialized by client_db->lock */
+   struct rb_node file_link;   /* uses map_hva as key */
+
+   /* rmap components - serialized by page lock */
+   struct anon_vma *req_anon_vma;
+   struct anon_vma *map_anon_vma;
+};
+
+struct target_db {
+   struct mm_struct *mm;   /* mm of this struct */
+   struct hlist_node db_link;  /* database link */
+
+   struct mmu_notifier mn; /* for notifications from mm */
+   struct rcu_head rcu;/* for delayed freeing */
+   refcount_t refcnt;
+
+   spinlock_t l
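
The header is truncated here. To make the flow described in the commit
message concrete, a sketch of the introspector-side sequence; the
request layout and the ioctl number below are hypothetical stand-ins for
whatever include/uapi/linux/remote_mapping.h actually defines:

    #define _GNU_SOURCE
    #include <stdint.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    /* hypothetical request layout and ioctl number */
    struct kvmmem_map_req {
            uint64_t map_hva;       /* local anonymous page */
            uint64_t gpa;           /* introspected guest physical address */
    };
    #define KVMMEM_MAP _IOW('k', 0x01, struct kvmmem_map_req)

    static void *map_guest_page(uint64_t gpa)
    {
            struct kvmmem_map_req req;
            void *va;
            int fd;

            /* one-page anonymous mapping, locked and populated, as the
             * commit message requires (error cleanup omitted for brevity) */
            va = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED | MAP_POPULATE,
                      -1, 0);
            if (va == MAP_FAILED)
                    return NULL;

            fd = open("/dev/kvmmem", O_RDWR);
            if (fd < 0)
                    return NULL;

            req.map_hva = (uint64_t)(uintptr_t)va;
            req.gpa = gpa;
            if (ioctl(fd, KVMMEM_MAP, &req) < 0)
                    return NULL;

            return va;      /* now backed by the introspected guest's page */
    }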

[RFC PATCH v6 15/92] kvm: introspection: handle vCPU related introspection commands

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

Following the common structure used for all messages (kvmi_msg_hdr), all
vCPU related commands have another common structure (kvmi_vcpu_hdr). This
allows the receiving worker to validate and dispatch the message to the
proper vCPU (adding the handling function to its jobs list).

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |   8 ++
 include/uapi/linux/kvm_para.h  |   4 +-
 include/uapi/linux/kvmi.h  |   6 ++
 virt/kvm/kvmi_int.h|   3 +
 virt/kvm/kvmi_msg.c| 159 -
 5 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index a660def20b23..7f3c4f8fce63 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -232,6 +232,14 @@ The following C structures are meant to be used directly 
when communicating
 over the wire. The peer that detects any size mismatch should simply close
 the connection and report the error.
 
+The commands related to vCPU-s start with::
+
+   struct kvmi_vcpu_hdr {
+   __u16 vcpu;
+   __u16 padding1;
+   __u32 padding2;
+   }
+
 1. KVMI_GET_VERSION
 ---
 
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index 6c0ce49931e5..54c0e20f5b64 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -10,13 +10,15 @@
  * - kvm_para_available
  */
 
-/* Return values for hypercalls */
+/* Return values for hypercalls and VM introspection */
 #define KVM_ENOSYS 1000
 #define KVM_EFAULT EFAULT
 #define KVM_EINVAL EINVAL
 #define KVM_E2BIG  E2BIG
 #define KVM_EPERM  EPERM
 #define KVM_EOPNOTSUPP 95
+#define KVM_EAGAIN 11
+#define KVM_ENOMEM ENOMEM
 
 #define KVM_HC_VAPIC_POLL_IRQ  1
 #define KVM_HC_MMU_OP  2
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index ff35faabb7ed..29452da818e3 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -114,4 +114,10 @@ struct kvmi_control_vm_events {
__u32 padding2;
 };
 
+struct kvmi_vcpu_hdr {
+   __u16 vcpu;
+   __u16 padding1;
+   __u32 padding2;
+};
+
 #endif /* _UAPI__LINUX_KVMI_H */
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 47418e9a86f6..33ea05cb99af 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -118,5 +118,8 @@ void *kvmi_msg_alloc_check(size_t size);
 void kvmi_msg_free(void *addr);
 int kvmi_cmd_control_vm_events(struct kvmi *ikvm, unsigned int event_id,
   bool enable);
+int kvmi_add_job(struct kvm_vcpu *vcpu,
+void (*fct)(struct kvm_vcpu *vcpu, void *ctx),
+void *ctx, void (*free_fct)(void *ctx));
 
 #endif
diff --git a/virt/kvm/kvmi_msg.c b/virt/kvm/kvmi_msg.c
index a55c9e35be36..2728e6870d47 100644
--- a/virt/kvm/kvmi_msg.c
+++ b/virt/kvm/kvmi_msg.c
@@ -8,6 +8,18 @@
 #include 
 #include "kvmi_int.h"
 
+typedef int (*vcpu_reply_fct)(struct kvm_vcpu *vcpu,
+ const struct kvmi_msg_hdr *msg, int err,
+ const void *rpl, size_t rpl_size);
+
+struct kvmi_vcpu_cmd {
+   vcpu_reply_fct reply_cb;
+   struct {
+   struct kvmi_msg_hdr hdr;
+   struct kvmi_vcpu_hdr cmd;
+   } *msg;
+};
+
 static const char *const msg_IDs[] = {
[KVMI_CHECK_COMMAND] = "KVMI_CHECK_COMMAND",
[KVMI_CHECK_EVENT]   = "KVMI_CHECK_EVENT",
@@ -165,6 +177,23 @@ static int kvmi_msg_vm_maybe_reply(struct kvmi *ikvm,
return kvmi_msg_vm_reply(ikvm, msg, err, rpl, rpl_size);
 }
 
+int kvmi_msg_vcpu_reply(struct kvm_vcpu *vcpu,
+   const struct kvmi_msg_hdr *msg, int err,
+   const void *rpl, size_t rpl_size)
+{
+   return kvmi_msg_reply(IKVM(vcpu->kvm), msg, err, rpl, rpl_size);
+}
+
+int kvmi_msg_vcpu_drop_reply(struct kvm_vcpu *vcpu,
+ const struct kvmi_msg_hdr *msg, int err,
+ const void *rpl, size_t rpl_size)
+{
+   if (!kvmi_validate_no_reply(IKVM(vcpu->kvm), msg, rpl_size, err))
+   return -KVM_EINVAL;
+
+   return 0;
+}
+
 static int handle_get_version(struct kvmi *ikvm,
  const struct kvmi_msg_hdr *msg, const void *req)
 {
@@ -248,6 +277,23 @@ static int handle_control_vm_events(struct kvmi *ikvm,
return kvmi_msg_vm_maybe_reply(ikvm, msg, ec, NULL, 0);
 }
 
+static int kvmi_get_vcpu(struct kvmi *ikvm, unsigned int vcpu_idx,
+struct kvm_vcpu **dest)
+{
+   struct kvm *kvm = ikvm->kvm;
+   struct
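
The handler is truncated here. The wire layout it validates can still be
sketched; kvmi_vcpu_hdr is taken from the uapi hunk above, while the
kvmi_msg_hdr field layout (id/size/seq) is an assumption:

    #include <stdint.h>

    struct kvmi_msg_hdr { uint16_t id; uint16_t size; uint32_t seq; }; /* assumed */
    struct kvmi_vcpu_hdr { uint16_t vcpu; uint16_t padding1; uint32_t padding2; };

    /* Every vCPU-related command on the wire is
     *   [kvmi_msg_hdr][kvmi_vcpu_hdr][command-specific payload]
     * so the receiving worker can validate the vCPU index once and then
     * dispatch the message to that vCPU's jobs list. */
    struct vcpu_cmd_wire {
            struct kvmi_msg_hdr hdr;
            struct kvmi_vcpu_hdr vcpu_hdr;
            uint8_t payload[];      /* e.g. command-specific structure */
    };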

[RFC PATCH v6 44/92] kvm: introspection: extend the internal database of tracked pages with write_bitmap info

2019-08-09 Thread Adalbert Lazăr
This will allow us to use the subpage protection feature.

Signed-off-by: Adalbert Lazăr 
---
 virt/kvm/kvmi.c | 46 +
 virt/kvm/kvmi_int.h |  1 +
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 4a9a4430a460..e18dfffa25ac 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -32,6 +32,7 @@ static void kvmi_track_flush_slot(struct kvm *kvm, struct 
kvm_memory_slot *slot,
 static const u8 full_access  = KVMI_PAGE_ACCESS_R |
KVMI_PAGE_ACCESS_W |
KVMI_PAGE_ACCESS_X;
+static const u32 default_write_access_bitmap;
 
 void *kvmi_msg_alloc(void)
 {
@@ -57,23 +58,32 @@ static struct kvmi_mem_access *__kvmi_get_gfn_access(struct 
kvmi *ikvm,
return radix_tree_lookup(&ikvm->access_tree, gfn);
 }
 
+/*
+ * TODO: intercept any SPP change made on pages present in our radix tree.
+ *
+ * bitmap must have the same value as the corresponding SPPT entry.
+ */
 static int kvmi_get_gfn_access(struct kvmi *ikvm, const gfn_t gfn,
-  u8 *access)
+  u8 *access, u32 *write_bitmap)
 {
struct kvmi_mem_access *m;
 
+   *write_bitmap = default_write_access_bitmap;
*access = full_access;
 
read_lock(&ikvm->access_tree_lock);
m = __kvmi_get_gfn_access(ikvm, gfn);
-   if (m)
+   if (m) {
*access = m->access;
+   *write_bitmap = m->write_bitmap;
+   }
read_unlock(&ikvm->access_tree_lock);
 
return m ? 0 : -1;
 }
 
-static int kvmi_set_gfn_access(struct kvm *kvm, gfn_t gfn, u8 access)
+static int kvmi_set_gfn_access(struct kvm *kvm, gfn_t gfn, u8 access,
+  u32 write_bitmap)
 {
struct kvmi_mem_access *m;
struct kvmi_mem_access *__m;
@@ -87,6 +97,7 @@ static int kvmi_set_gfn_access(struct kvm *kvm, gfn_t gfn, u8 
access)
 
m->gfn = gfn;
m->access = access;
+   m->write_bitmap = write_bitmap;
 
if (radix_tree_preload(GFP_KERNEL)) {
err = -KVM_ENOMEM;
@@ -100,6 +111,7 @@ static int kvmi_set_gfn_access(struct kvm *kvm, gfn_t gfn, 
u8 access)
__m = __kvmi_get_gfn_access(ikvm, gfn);
if (__m) {
__m->access = access;
+   __m->write_bitmap = write_bitmap;
kvmi_arch_update_page_tracking(kvm, NULL, __m);
if (access == full_access) {
radix_tree_delete(&ikvm->access_tree, gfn);
@@ -124,12 +136,22 @@ static int kvmi_set_gfn_access(struct kvm *kvm, gfn_t 
gfn, u8 access)
return err;
 }
 
+static bool spp_access_allowed(gpa_t gpa, unsigned long bitmap)
+{
+   u32 off = (gpa & ~PAGE_MASK);
+   u32 spp = off / 128;
+
+   return test_bit(spp, &bitmap);
+}
+
 static bool kvmi_restricted_access(struct kvmi *ikvm, gpa_t gpa, u8 access)
 {
+   u32 allowed_bitmap;
u8 allowed_access;
int err;
 
-   err = kvmi_get_gfn_access(ikvm, gpa_to_gfn(gpa), &allowed_access);
+   err = kvmi_get_gfn_access(ikvm, gpa_to_gfn(gpa), &allowed_access,
+ &allowed_bitmap);
 
if (err)
return false;
@@ -138,8 +160,14 @@ static bool kvmi_restricted_access(struct kvmi *ikvm, 
gpa_t gpa, u8 access)
 * We want to be notified only for violations involving access
 * bits that we've specifically cleared
 */
-   if ((~allowed_access) & access)
+   if ((~allowed_access) & access) {
+   bool write_access = (access & KVMI_PAGE_ACCESS_W);
+
+   if (write_access && spp_access_allowed(gpa, allowed_bitmap))
+   return false;
+
return true;
+   }
 
return false;
 }
@@ -1126,8 +1154,9 @@ void kvmi_handle_requests(struct kvm_vcpu *vcpu)
 int kvmi_cmd_get_page_access(struct kvmi *ikvm, u64 gpa, u8 *access)
 {
gfn_t gfn = gpa_to_gfn(gpa);
+   u32 ignored_write_bitmap;
 
-   kvmi_get_gfn_access(ikvm, gfn, access);
+   kvmi_get_gfn_access(ikvm, gfn, access, &ignored_write_bitmap);
 
return 0;
 }
@@ -1136,10 +1165,11 @@ int kvmi_cmd_set_page_access(struct kvmi *ikvm, u64 
gpa, u8 access)
 {
gfn_t gfn = gpa_to_gfn(gpa);
u8 ignored_access;
+   u32 write_bitmap;
 
-   kvmi_get_gfn_access(ikvm, gfn, &ignored_access);
+   kvmi_get_gfn_access(ikvm, gfn, &ignored_access, &write_bitmap);
 
-   return kvmi_set_gfn_access(ikvm->kvm, gfn, access);
+   return kvmi_set_gfn_access(ikvm->kvm, gfn, access, write_bitmap);
 }
 
 int kvmi_cmd_control_events(struct kvm_vcpu *vcpu, unsigned int event_id,
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index 3f0c7a03b4a1..d9a10a3b7082 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -141,6 +141,7 @@ struct kvmi {
 struct kvmi_mem_access {
gfn_t gfn;
u8 access;
+  
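
(The struct is cut off here; judging by the code above, the new member
is the ``u32 write_bitmap``.) To make the spp_access_allowed() arithmetic
above concrete, a small self-contained check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* a write at page offset 0x2a0 falls in sub-page 0x2a0/128 = 5 */
            uint64_t gpa = 0x1000 + 0x2a0;
            uint32_t off = gpa & 0xfff;     /* gpa & ~PAGE_MASK */
            uint32_t spp = off / 128;

            assert(spp == 5);
            /* a write_bitmap with bit 5 set allows writes to that region */
            assert(((1u << 5) >> spp) & 1);
            return 0;
    }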

[RFC PATCH v6 74/92] kvm: x86: do not unconditionally patch the hypercall instruction during emulation

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

It can happen that we end up emulating the VMCALL instruction as a
result of the handling of an EPT write fault. In this situation, the
emulator will try to unconditionally patch the correct hypercall opcode
bytes using emulator_write_emulated(). However, this last call uses the
fault GPA (if available) or walks the guest page tables at RIP,
otherwise. The trouble begins when using KVMI, where we forbid the use of
the fault GPA and fall back to the guest page-table walk: in Windows (8.1
and newer) the page that we try to write into is marked read-execute and
as such emulator_write_emulated() fails and we inject a write #PF,
leading to a guest crash.

The fix is rather simple: check the existing instruction bytes before
doing the patching. This does not change the normal KVM behaviour, but
does help when using KVMI as we no longer inject a write #PF.

CC: Joerg Roedel 
Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/kvm/x86.c | 23 ---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04b1d2916a0a..965c4f0108eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7363,16 +7363,33 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
+#define KVM_HYPERCALL_INSN_LEN 3
+
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
+   int err;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-   char instruction[3];
+   char buf[KVM_HYPERCALL_INSN_LEN];
+   char instruction[KVM_HYPERCALL_INSN_LEN];
unsigned long rip = kvm_rip_read(vcpu);
 
+   err = emulator_read_emulated(ctxt, rip, buf, sizeof(buf),
+   &ctxt->exception);
+   if (err != X86EMUL_CONTINUE)
+   return err;
+
kvm_x86_ops->patch_hypercall(vcpu, instruction);
+   if (!memcmp(instruction, buf, sizeof(instruction)))
+   /*
+    * The hypercall instruction is the correct one. Retry
+    * its execution; maybe we got here as a result of an
+    * event other than #UD which has been resolved in the
+    * meantime.
+    */
+   return X86EMUL_CONTINUE;
 
-   return emulator_write_emulated(ctxt, rip, instruction, 3,
-   &ctxt->exception);
+   return emulator_write_emulated(ctxt, rip, instruction,
+  sizeof(instruction), &ctxt->exception);
 }
 
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)

[RFC PATCH v6 31/92] kvm: introspection: add KVMI_EVENT_PF

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

This event is sent when a #PF occurs due to a failed permission check
in the shadow page tables, for a page in which the introspection tool
has shown interest.

The introspection tool can respond to a KVMI_EVENT_PF event with custom
input for the current instruction. This input is used to trick the guest
software into believing it has read certain data, in order to hide the
content of certain memory areas (eg. hide injected code from integrity
checkers).

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  63 ++
 arch/x86/kvm/kvmi.c|  38 ++-
 arch/x86/kvm/x86.c |   7 +-
 include/linux/kvmi.h   |   4 ++
 include/uapi/linux/kvmi.h  |  18 +
 virt/kvm/kvmi.c| 103 +
 virt/kvm/kvmi_int.h|  13 
 virt/kvm/kvmi_msg.c|  55 +++
 8 files changed, 298 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 957641802cac..0fc51b57b1e8 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -618,3 +618,66 @@ The introspection tool has a chance to unhook and close 
the KVMI channel
 This event is sent when a new vCPU is created and the introspection has
 been enabled for this event (see *KVMI_CONTROL_VM_EVENTS*).
 
+3. KVMI_EVENT_PF
+
+
+:Architectures: x86
+:Versions: >= 1
+:Actions: CONTINUE, CRASH, RETRY
+:Parameters:
+
+::
+
+   struct kvmi_event;
+   struct kvmi_event_pf {
+   __u64 gva;
+   __u64 gpa;
+   __u8 access;
+   __u8 padding1;
+   __u16 view;
+   __u32 padding2;
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply;
+   struct kvmi_event_pf_reply {
+   __u64 ctx_addr;
+   __u32 ctx_size;
+   __u8 singlestep;
+   __u8 rep_complete;
+   __u16 padding;
+   __u8 ctx_data[256];
+   };
+
+This event is sent when a hypervisor page fault occurs due to a failed
+permission check in the shadow page tables, the introspection has
+been enabled for this event (see *KVMI_CONTROL_EVENTS*) and the event was
+generated for a page in which the introspector has shown interest
+(ie. has previously touched it by adjusting the spte permissions).
+
+The shadow page tables can be used by the introspection tool to guarantee
+the purpose of memory areas inside the guest (code, rodata, stack, heap
+etc.). Each attempt at an operation unfitting for a certain memory
+range (eg. execute code in heap) triggers a page fault and gives the
+introspection tool the chance to audit the code attempting the operation.
+
+``kvmi_event``, guest virtual address (or 0xffffffffffffffff/UNMAPPED_GVA),
+guest physical address, access flags (eg. KVMI_PAGE_ACCESS_R) and the
+EPT view are sent to the introspector.
+
+The *CONTINUE* action will continue the page fault handling via emulation
+(with custom input if ``ctx_size`` > 0). The use of custom input is
+to trick the guest software into believing it has read certain data,
+in order to hide the content of certain memory areas (eg. hide injected
+code from integrity checkers). If ``rep_complete`` is not zero, the REP
+prefixed instruction should be emulated just once (or at least no other
+*KVMI_EVENT_PF* event should be sent for the current instruction).
+
+The *RETRY* action is used by the introspector to retry the execution of
+the current instruction, either using single-step (if ``singlestep`` is
+not zero) or by returning to the guest (if the introspector changed the
+instruction pointer or the page restrictions).
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index d7b9201582b4..121819f9c487 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -94,7 +94,43 @@ void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct 
kvmi_event *ev)
 bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
u8 access)
 {
-   return KVMI_EVENT_ACTION_CONTINUE; /* TODO */
+   struct kvmi_vcpu *ivcpu;
+   u32 ctx_size;
+   u64 ctx_addr;
+   u32 action;
+   bool singlestep_ignored;
+   bool ret = false;
+
+   if (!kvm_spt_fault(vcpu))
+   /* We are only interested in EPT/NPT violations */
+   return true;
+
+   ivcpu = IVCPU(vcpu);
+   ctx_size = sizeof(ivcpu->ctx_data);
+
+   if (ivcpu->effective_rep_complete)
+   return true;
+
+   action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &singlestep_ignored,
+ &ivcpu->rep_complete, &ctx_addr,
+ ivcpu->ctx_data, &ctx_size);
+
+   ivcpu->ctx_size = 0;
+ 
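
The function is truncated here. For illustration, a sketch of the reply
an introspection tool would send to feed custom bytes to the emulator
(the code-hiding trick described in the documentation hunk); the numeric
message/event IDs are parameters and the CONTINUE action is assumed to
be 0:

    #include <stdint.h>
    #include <string.h>

    struct kvmi_msg_hdr { uint16_t id; uint16_t size; uint32_t seq; }; /* assumed */
    struct kvmi_vcpu_hdr { uint16_t vcpu; uint16_t padding1; uint32_t padding2; };
    struct kvmi_event_reply { uint8_t action; uint8_t event; uint16_t padding1; uint32_t padding2; };
    struct kvmi_event_pf_reply {
            uint64_t ctx_addr;
            uint32_t ctx_size;
            uint8_t singlestep;
            uint8_t rep_complete;
            uint16_t padding;
            uint8_t ctx_data[256];
    };

    struct pf_reply_wire {
            struct kvmi_msg_hdr hdr;
            struct kvmi_vcpu_hdr vcpu_hdr;
            struct kvmi_event_reply ev;
            struct kvmi_event_pf_reply pf;
    };

    static void build_pf_reply(struct pf_reply_wire *r, uint16_t reply_msg_id,
                               uint32_t event_seq, uint16_t vcpu,
                               uint8_t pf_event_id, uint64_t gva,
                               const void *fake, uint32_t len)
    {
            memset(r, 0, sizeof(*r));       /* padding must be zero */
            r->hdr.id = reply_msg_id;       /* KVMI_EVENT_REPLY */
            r->hdr.seq = event_seq;         /* must match the event's seq */
            r->hdr.size = sizeof(*r) - sizeof(r->hdr);
            r->vcpu_hdr.vcpu = vcpu;
            r->ev.action = 0;               /* KVMI_EVENT_ACTION_CONTINUE (assumed 0) */
            r->ev.event = pf_event_id;      /* KVMI_EVENT_PF */
            r->pf.ctx_addr = gva;           /* where the fake bytes "live" */
            if (len > sizeof(r->pf.ctx_data))
                    len = sizeof(r->pf.ctx_data);
            r->pf.ctx_size = len;
            memcpy(r->pf.ctx_data, fake, len);
    }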

[RFC PATCH v6 27/92] kvm: introspection: use page track

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

From preread, prewrite and preexec callbacks we will send the
KVMI_EVENT_PF events caused by access rights enforced by the introspection
tool.

Signed-off-by: Mihai Donțu 
Co-developed-by: Nicușor Cîțu 
Signed-off-by: Nicușor Cîțu 
Co-developed-by: Marian Rotariu 
Signed-off-by: Marian Rotariu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 arch/x86/include/asm/kvmi_host.h |  12 ++
 arch/x86/kvm/kvmi.c  |  45 +
 include/uapi/linux/kvmi.h|   4 +
 virt/kvm/kvmi.c  | 293 ++-
 virt/kvm/kvmi_int.h  |  21 +++
 5 files changed, 374 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/kvmi_host.h

diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h
new file mode 100644
index ..7ab6dd71a0c2
--- /dev/null
+++ b/arch/x86/include/asm/kvmi_host.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVMI_HOST_H
+#define _ASM_X86_KVMI_HOST_H
+
+#include 
+#include 
+
+struct kvmi_arch_mem_access {
+   unsigned long 
active[KVM_PAGE_TRACK_MAX][BITS_TO_LONGS(KVM_MEM_SLOTS_NUM)];
+};
+
+#endif /* _ASM_X86_KVMI_HOST_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 97c72cdc6fb0..d7b9201582b4 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -91,6 +91,12 @@ void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct 
kvmi_event *ev)
kvmi_get_msrs(vcpu, event);
 }
 
+bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+   u8 access)
+{
+   return KVMI_EVENT_ACTION_CONTINUE; /* TODO */
+}
+
 int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu,
struct kvmi_get_vcpu_info_reply *rpl)
 {
@@ -102,3 +108,42 @@ int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu,
return 0;
 }
 
+static const struct {
+   unsigned int allow_bit;
+   enum kvm_page_track_mode track_mode;
+} track_modes[] = {
+   { KVMI_PAGE_ACCESS_R, KVM_PAGE_TRACK_PREREAD },
+   { KVMI_PAGE_ACCESS_W, KVM_PAGE_TRACK_PREWRITE },
+   { KVMI_PAGE_ACCESS_X, KVM_PAGE_TRACK_PREEXEC },
+};
+
+void kvmi_arch_update_page_tracking(struct kvm *kvm,
+   struct kvm_memory_slot *slot,
+   struct kvmi_mem_access *m)
+{
+   struct kvmi_arch_mem_access *arch = &m->arch;
+   int i;
+
+   if (!slot) {
+   slot = gfn_to_memslot(kvm, m->gfn);
+   if (!slot)
+   return;
+   }
+
+   for (i = 0; i < ARRAY_SIZE(track_modes); i++) {
+   unsigned int allow_bit = track_modes[i].allow_bit;
+   enum kvm_page_track_mode mode = track_modes[i].track_mode;
+   bool slot_tracked = test_bit(slot->id, arch->active[mode]);
+
+   if (m->access & allow_bit) {
+   if (slot_tracked) {
+   kvm_slot_page_track_remove_page(kvm, slot,
+   m->gfn, mode);
+   clear_bit(slot->id, arch->active[mode]);
+   }
+   } else if (!slot_tracked) {
+   kvm_slot_page_track_add_page(kvm, slot, m->gfn, mode);
+   set_bit(slot->id, arch->active[mode]);
+   }
+   }
+}
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index aa5bc909e278..c56e676ddb2b 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -70,6 +70,10 @@ enum {
 #define KVMI_EVENT_ACTION_RETRY 1
 #define KVMI_EVENT_ACTION_CRASH 2
 
+#define KVMI_PAGE_ACCESS_R (1 << 0)
+#define KVMI_PAGE_ACCESS_W (1 << 1)
+#define KVMI_PAGE_ACCESS_X (1 << 2)
+
 #define KVMI_MSG_SIZE (4096 - sizeof(struct kvmi_msg_hdr))
 
 struct kvmi_msg_hdr {
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index d0d9adf5b6ed..5cbc82b284f4 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -11,10 +11,27 @@
 #include 
 
 static struct kmem_cache *msg_cache;
+static struct kmem_cache *radix_cache;
 static struct kmem_cache *job_cache;
 
 static bool kvmi_create_vcpu_event(struct kvm_vcpu *vcpu);
 static void kvmi_abort_events(struct kvm *kvm);
+static bool kvmi_track_preread(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+   u8 *new, int bytes, struct kvm_page_track_notifier_node *node,
+   bool *data_ready);
+static bool kvmi_track_prewrite(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+   const u8 *new, int bytes, struct kvm_page_track_notifier_node *node);
+static bool kvmi_track_preexec(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
+   struct kvm_page_track_notifier_node *node);
+static void kvmi_track_create_slot(struct kvm *kvm,
+   struct kvm_memory_slot *slot, unsigned long npages,
+   struct kvm_page_track_notifier_node *node);

[RFC PATCH v6 03/92] kvm: introspection: add permission access ioctls

2019-08-09 Thread Adalbert Lazăr
KVM_INTROSPECTION_COMMAND and KVM_INTROSPECTION_EVENTS should be used
by userspace/QEMU to allow access to specific (or all) introspection
commands and events.

By default, all introspection events and almost all introspection commands
are disallowed. There are a couple of commands that are always allowed
(those querying the introspection capabilities).

Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/api.txt | 56 +++-
 include/uapi/linux/kvm.h  |  6 +++
 virt/kvm/kvm_main.c   |  6 +++
 virt/kvm/kvmi.c   | 85 +++
 virt/kvm/kvmi_int.h   | 51 +++
 5 files changed, 203 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 28d4429f9ae9..ea3135d365c7 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3889,7 +3889,61 @@ It will fail with -EINVAL if padding is not zero.
 The KVMI version can be retrieved using the KVM_CAP_INTROSPECTION of
 the KVM_CHECK_EXTENSION ioctl() at run-time.
 
-4.997 KVM_INTROSPECTION_UNHOOK
+4.997 KVM_INTROSPECTION_COMMAND
+
+Capability: KVM_CAP_INTROSPECTION
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_introspection_feature (in)
+Returns: 0 on success, a negative value on error
+
+This ioctl is used to allow or disallow introspection commands
+for the current VM. By default, almost all commands are disallowed
+except for those used to query the API.
+
+struct kvm_introspection_feature {
+   __u32 allow;
+   __s32 id;
+};
+
+If allow is 1, the command specified by id is allowed. If allow is 0,
+the command is disallowed.
+
+Unless set to -1 (meaning all commands), id must be a command ID
+(e.g. KVMI_GET_VERSION, KVMI_GET_GUEST_INFO etc.)
+
+Errors:
+
+  -EINVAL if the command is unknown
+  -EPERM if the command can't be disallowed (e.g. KVMI_GET_VERSION)
+
+4.998 KVM_INTROSPECTION_EVENT
+
+Capability: KVM_CAP_INTROSPECTION
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_introspection_feature (in)
+Returns: 0 on success, a negative value on error
+
+This ioctl is used to allow or disallow introspection events
+for the current VM. By default, all events are disallowed.
+
+struct kvm_introspection_feature {
+   __u32 allow;
+   __s32 id;
+};
+
+If allow is 1, the event specified by id is allowed. If allow is 0,
+the event is disallowed.
+
+Unless set to -1 (meaning all events), id must be an event ID
+(e.g. KVMI_EVENT_UNHOOK, KVMI_EVENT_CR, etc.)
+
+Errors:
+
+  -EINVAL if the event is unknown
+
+4.999 KVM_INTROSPECTION_UNHOOK
 
 Capability: KVM_CAP_INTROSPECTION
 Architectures: x86
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index bae37bf37338..2ff05fd123e3 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1527,9 +1527,15 @@ struct kvm_introspection {
__u32 padding;
__u8 uuid[16];
 };
+struct kvm_introspection_feature {
+   __u32 allow;
+   __s32 id;
+};
 #define KVM_INTROSPECTION_HOOK_IOW(KVMIO, 0xff, struct kvm_introspection)
 #define KVM_INTROSPECTION_UNHOOK  _IO(KVMIO, 0xfe)
 /* write true on force-reset, false otherwise */
+#define KVM_INTROSPECTION_COMMAND _IOW(KVMIO, 0xfd, struct 
kvm_introspection_feature)
+#define KVM_INTROSPECTION_EVENT   _IOW(KVMIO, 0xfc, struct 
kvm_introspection_feature)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 09a930ac007d..8399b826f2d2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3270,6 +3270,12 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_INTROSPECTION_HOOK:
r = kvmi_ioctl_hook(kvm, argp);
break;
+   case KVM_INTROSPECTION_COMMAND:
+   r = kvmi_ioctl_command(kvm, argp);
+   break;
+   case KVM_INTROSPECTION_EVENT:
+   r = kvmi_ioctl_event(kvm, argp);
+   break;
case KVM_INTROSPECTION_UNHOOK:
r = kvmi_ioctl_unhook(kvm, arg);
break;
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 591f6ee22135..dc64f975998f 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -169,6 +169,91 @@ int kvmi_ioctl_hook(struct kvm *kvm, void __user *argp)
return kvmi_hook(kvm, &i);
 }
 
+static int kvmi_ioctl_get_feature(void __user *argp, bool *allow, int *id,
+ unsigned long *bitmask)
+{
+   struct kvm_introspection_feature feat;
+   int all_bits = -1;
+
+   if (copy_from_user(&feat, argp, sizeof(feat)))
+   return -EFAULT;
+
+   if (feat.id < 0 && feat.id != all_bits)
+   return -EINVAL;
+
+   *allow = !!(feat.allow & 1);
+   *id = feat.id;
+   *bitmask = *id == all_bits ? -1 : BIT(feat.id);
+
+   return 0;
+}
+
+static int kvmi_io
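
The rest of the patch is truncated here. For illustration, a sketch of
the userspace (QEMU) side of the new ioctl, redeclaring the uapi bits
from the hunks above so it is self-contained (KVMIO is the standard 0xAE
KVM ioctl type):

    #include <stdint.h>
    #include <sys/ioctl.h>

    struct kvm_introspection_feature {
            uint32_t allow;
            int32_t id;
    };

    #define KVMIO 0xAE
    #define KVM_INTROSPECTION_COMMAND \
            _IOW(KVMIO, 0xfd, struct kvm_introspection_feature)

    /* Allow every introspection command for this VM (id == -1). */
    static int allow_all_commands(int vm_fd)
    {
            struct kvm_introspection_feature feat = { .allow = 1, .id = -1 };

            return ioctl(vm_fd, KVM_INTROSPECTION_COMMAND, &feat);
    }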

[RFC PATCH v6 16/92] kvm: introspection: handle events and event replies

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

All events are sent by the vCPU thread, which will handle any
introspection command while waiting for the reply.

The event reply messages contain a common structure (kvmi_vcpu_hdr), as
any vCPU related command, which allows the receiving worker to dispatch
the reply as it does with any other introspection command sent for a
specific vCPU.

The kernel side will gracefully handle commands coming from an
introspection tool compiled with older or newer versions of the KVMI API.
However, it will only accept smaller replies (coming from older versions),
but not the bigger/newer ones (this should make the kernel code simpler).

TODO: Not quite true. An event reply has a common part (kvmi_event_reply)
and an event specific part (eg. the new value for MSR x). If the common
part is smaller, the event will be rejected.

The code from handle_event_reply():

common = sizeof(struct kvmi_vcpu_hdr) + sizeof(*reply);
if (unlikely(msg->size < common))
goto out;

should be changed to

min_common = sizeof(struct kvmi_vcpu_hdr) + offsetof(reply...)
if (unlikely(msg->size < min_common))
goto out;

Signed-off-by: Mihai Donțu 
Co-developed-by: Adalbert Lazăr 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst |  56 +
 arch/x86/include/uapi/asm/kvmi.h   |  29 +++
 arch/x86/kvm/Makefile  |   2 +-
 arch/x86/kvm/kvmi.c|  92 
 arch/x86/kvm/x86.c |  10 +++
 include/linux/kvm_host.h   |   3 +
 include/uapi/linux/kvmi.h  |  16 
 virt/kvm/kvmi.c|  15 
 virt/kvm/kvmi_int.h|  16 
 virt/kvm/kvmi_msg.c| 129 +
 10 files changed, 367 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/uapi/asm/kvmi.h
 create mode 100644 arch/x86/kvm/kvmi.c

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 7f3c4f8fce63..e7d9a3816e00 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -427,3 +427,59 @@ in almost all cases, it must reply with: continue, retry, 
crash, etc.
 * -KVM_EINVAL - padding is not zero
 * -KVM_EPERM - the access is restricted by the host
 
+Events
+==
+
+All vCPU events are sent using the *KVMI_EVENT* message id. No event
+will be sent unless explicitly enabled with a *KVMI_CONTROL_EVENTS*
+or a *KVMI_CONTROL_VM_EVENTS* command or requested, as it is the case
+with the *KVMI_EVENT_PAUSE_VCPU* event (see **KVMI_PAUSE_VCPU**).
+
+There is one VM event, *KVMI_EVENT_UNHOOK*, which doesn't have a reply,
+but shares the kvmi_event structure, for consistency with the vCPU events.
+
+The message data begins with a common structure, having the size of the
+structure, the vCPU index and the event id::
+
+   struct kvmi_event {
+   __u16 size;
+   __u16 vcpu;
+   __u8 event;
+   __u8 padding[3];
+   struct kvmi_event_arch arch;
+   }
+
+On x86 the structure looks like this::
+
+   struct kvmi_event_arch {
+   __u8 mode;
+   __u8 padding[7];
+   struct kvm_regs regs;
+   struct kvm_sregs sregs;
+   struct {
+   __u64 sysenter_cs;
+   __u64 sysenter_esp;
+   __u64 sysenter_eip;
+   __u64 efer;
+   __u64 star;
+   __u64 lstar;
+   __u64 cstar;
+   __u64 pat;
+   __u64 shadow_gs;
+   } msrs;
+   };
+
+It contains information about the vCPU state at the time of the event.
+
+The reply to an event has the *KVMI_EVENT_REPLY* message id and begins
+with two common structures::
+
+   struct kvmi_vcpu_hdr;
+   struct kvmi_event_reply {
+   __u8 action;
+   __u8 event;
+   __u16 padding1;
+   __u32 padding2;
+   };
+
+Specific data can follow these common structures.
diff --git a/arch/x86/include/uapi/asm/kvmi.h b/arch/x86/include/uapi/asm/kvmi.h
new file mode 100644
index ..551f9ed1ed9c
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvmi.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_KVMI_H
+#define _UAPI_ASM_X86_KVMI_H
+
+/*
+ * KVM introspection - x86 specific structures and definitions
+ */
+
+#include 
+
+struct kvmi_event_arch {
+   __u8 mode;  /* 2, 4 or 8 */
+   __u8 padding[7];
+   struct kvm_regs regs;
+   struct kvm_sregs sregs;
+   struct {
+   __u64 sysenter_cs;
+   __u64 sysenter_esp;
+   __u64 sysenter_eip;
+   __u64 efer;
+   __u64 star;
+   __u64 lstar;
+   __u64 cstar;
+
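
The arch structure is truncated here. A sketch of how a tool would pull
events off the socket, relying on the size fields described in the
documentation hunk; the kvmi_msg_hdr layout (id/size/seq) is an
assumption:

    #include <stdint.h>
    #include <unistd.h>

    struct kvmi_msg_hdr { uint16_t id; uint16_t size; uint32_t seq; }; /* assumed */

    /* Read one message: the header first, then exactly hdr->size bytes of
     * payload. For events that payload is a struct kvmi_event (which
     * leads with its own size) followed by event-specific data, so tools
     * built against older or newer KVMI versions can skip unknown tails. */
    static int read_kvmi_msg(int fd, struct kvmi_msg_hdr *hdr, void *payload,
                             size_t max)
    {
            if (read(fd, hdr, sizeof(*hdr)) != (ssize_t)sizeof(*hdr))
                    return -1;
            if (hdr->size > max)
                    return -1;      /* buffer cannot hold the payload */
            if (read(fd, payload, hdr->size) != (ssize_t)hdr->size)
                    return -1;
            return 0;
    }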

[RFC PATCH v6 32/92] kvm: introspection: add KVMI_GET_PAGE_ACCESS

2019-08-09 Thread Adalbert Lazăr
From: Mihai Donțu 

Returns the spte access bits (rwx) for an array of guest physical
addresses.

It does this by checking the radix tree in which only the spte bits
"enforced" by the introspection tool are saved. This information should
already be known by the tool. Not to mention that the KVMI_EVENT_PF
events are sent only for EPT violations caused by these restrictions.
So, we might drop this command.

Signed-off-by: Mihai Donțu 
Signed-off-by: Adalbert Lazăr 
---
 Documentation/virtual/kvm/kvmi.rst | 54 ++
 arch/x86/kvm/kvmi.c| 41 +++
 include/uapi/linux/kvmi.h  | 11 ++
 virt/kvm/kvmi.c|  9 +
 virt/kvm/kvmi_int.h|  6 
 virt/kvm/kvmi_msg.c| 17 ++
 6 files changed, 138 insertions(+)

diff --git a/Documentation/virtual/kvm/kvmi.rst 
b/Documentation/virtual/kvm/kvmi.rst
index 0fc51b57b1e8..c27fea73ccfb 100644
--- a/Documentation/virtual/kvm/kvmi.rst
+++ b/Documentation/virtual/kvm/kvmi.rst
@@ -509,6 +509,60 @@ by the *KVMI_CONTROL_VM_EVENTS* command.
 * -KVM_EPERM - the access is restricted by the host
* -KVM_EOPNOTSUPP - one of the events can't be intercepted in the current setup
 
+9. KVMI_GET_PAGE_ACCESS
+---
+
+:Architectures: all
+:Versions: >= 1
+:Parameters:
+
+::
+
+   struct kvmi_get_page_access {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   __u64 gpa[0];
+   };
+
+:Returns:
+
+::
+
+   struct kvmi_error_code;
+   struct kvmi_get_page_access_reply {
+   __u8 access[0];
+   };
+
+Returns the spte access bits (rwx) for an array of ``count`` guest
+physical addresses.
+
+The valid access bits for *KVMI_GET_PAGE_ACCESS* and *KVMI_SET_PAGE_ACCESS*
+are::
+
+   KVMI_PAGE_ACCESS_R
+   KVMI_PAGE_ACCESS_W
+   KVMI_PAGE_ACCESS_X
+
+By default, for any guest physical address, the returned access mode will
+be 'rwx' (all the above bits). If the introspection tool must prevent
+the code execution from a guest page, for example, it should use the
+KVMI_SET_PAGE_ACCESS command to set the 'rw' bits for any guest physical
+addresses contained in that page. Of course, in order to receive
+page fault events when these violations take place, the KVMI_CONTROL_EVENTS
+command must be used to enable this type of event (KVMI_EVENT_PF).
+
+On Intel hardware with multiple EPT views, the ``view`` argument selects the
+EPT view (0 is primary). On all other hardware it must be zero.
+
+:Errors:
+
+* -KVM_EINVAL - the selected SPT view is invalid
+* -KVM_EINVAL - padding is not zero
* -KVM_EOPNOTSUPP - an SPT view was selected but the hardware doesn't support it
+* -KVM_EAGAIN - the selected vCPU can't be introspected yet
+* -KVM_ENOMEM - not enough memory to allocate the reply
+
 Events
 ==
 
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 121819f9c487..59cf33127b4b 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -183,3 +183,44 @@ void kvmi_arch_update_page_tracking(struct kvm *kvm,
}
}
 }
+
+int kvmi_arch_cmd_get_page_access(struct kvmi *ikvm,
+ const struct kvmi_msg_hdr *msg,
+ const struct kvmi_get_page_access *req,
+ struct kvmi_get_page_access_reply **dest,
+ size_t *dest_size)
+{
+   struct kvmi_get_page_access_reply *rpl = NULL;
+   size_t rpl_size = 0;
+   size_t k, n = req->count;
+   int ec = 0;
+
+   if (req->padding)
+   return -KVM_EINVAL;
+
+   if (msg->size < sizeof(*req) + req->count * sizeof(req->gpa[0]))
+   return -KVM_EINVAL;
+
+   if (req->view != 0) /* TODO */
+   return -KVM_EOPNOTSUPP;
+
+   rpl_size = sizeof(*rpl) + sizeof(rpl->access[0]) * n;
+   rpl = kvmi_msg_alloc_check(rpl_size);
+   if (!rpl)
+   return -KVM_ENOMEM;
+
+   for (k = 0; k < n && ec == 0; k++)
+   ec = kvmi_cmd_get_page_access(ikvm, req->gpa[k],
+ &rpl->access[k]);
+
+   if (ec) {
+   kvmi_msg_free(rpl);
+   return ec;
+   }
+
+   *dest = rpl;
+   *dest_size = rpl_size;
+
+   return 0;
+}
+
diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h
index 40a5c304c26f..047436a0bdc0 100644
--- a/include/uapi/linux/kvmi.h
+++ b/include/uapi/linux/kvmi.h
@@ -116,6 +116,17 @@ struct kvmi_get_guest_info_reply {
__u32 padding[3];
 };
 
+struct kvmi_get_page_access {
+   __u16 view;
+   __u16 count;
+   __u32 padding;
+   __u64 gpa[0];
+};
+
+struct kvmi_get_page_access_reply {
+   __u8 access[0];
+};
+
 struct kvmi_get_vcpu_info_reply {
__u64 tsc_speed;
 };
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index 0264115
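
The kvmi.c hunk is cut off here. For illustration, a tool-side sketch
building a *KVMI_GET_PAGE_ACCESS* request with its flexible gpa[] array,
mirroring the size check in kvmi_arch_cmd_get_page_access() above; the
kvmi_msg_hdr layout (id/size/seq) is an assumption:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct kvmi_msg_hdr { uint16_t id; uint16_t size; uint32_t seq; }; /* assumed */
    struct kvmi_get_page_access {
            uint16_t view;
            uint16_t count;
            uint32_t padding;
            uint64_t gpa[];         /* count entries follow */
    };

    /* The reply carries one access byte (rwx bits) per requested gpa. */
    static void *build_get_page_access(const uint64_t *gpas, uint16_t n,
                                       uint16_t msg_id, uint32_t seq,
                                       size_t *out_size)
    {
            size_t payload = sizeof(struct kvmi_get_page_access) +
                             n * sizeof(uint64_t);
            struct kvmi_msg_hdr *hdr = calloc(1, sizeof(*hdr) + payload);
            struct kvmi_get_page_access *req;

            if (!hdr)
                    return NULL;
            req = (struct kvmi_get_page_access *)(hdr + 1);
            hdr->id = msg_id;       /* KVMI_GET_PAGE_ACCESS */
            hdr->size = (uint16_t)payload;
            hdr->seq = seq;
            req->view = 0;          /* primary EPT view */
            req->count = n;
            memcpy(req->gpa, gpas, n * sizeof(uint64_t));
            *out_size = sizeof(*hdr) + payload;
            return hdr;
    }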
