date:20110106

Hi,
I'd like to ask for advice with following problem.
I have windows 2008 terminal server guest running on 2.6.36 x86_64
host (kvm 0.13.0).
guest has 4GB of RAM, 40GB storage on top of LVM volume and two cores. 
So far everything was running fine, but during periodic maintenance 
I wanted to force chkdisk after reboot.
So windows started checking disk integrity, but the problem is, that
it's way too slow - after ~12 hours, it's still running and seems
like it'll take ages to finish.
Both CPU cores seem to be fully loaded.
Is there some way I could check why it's taking so long, and fix
it eventually?
can I use kvm_trace to achieve this task? how?
I'll be very grateful for any help...
with best regards
nik

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Re: [PATCH 09/21] Introduce event-tap.

2011-01-06 Thread Yoshiaki Tamura

2011/1/4 Michael S. Tsirkin m...@redhat.com:
 On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
 2011/1/4 Michael S. Tsirkin m...@redhat.com:
  On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
  2011/1/4 Michael S. Tsirkin m...@redhat.com:
   On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
   2010/11/29 Stefan Hajnoczi stefa...@gmail.com:
On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
tamura.yoshi...@lab.ntt.co.jp wrote:
event-tap controls when to start an FT transaction, and provides proxy
functions to be called from net/block devices.  During an FT transaction,
it queues up net/block requests, and flushes them when the transaction
gets completed.
   
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp
Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp
---
 Makefile.target |    1 +
 block.h         |    9 +
 event-tap.c     |  794 
+++
 event-tap.h     |   34 +++
 net.h           |    4 +
 net/queue.c     |    1 +
 6 files changed, 843 insertions(+), 0 deletions(-)
 create mode 100644 event-tap.c
 create mode 100644 event-tap.h
   
event_tap_state is checked at the beginning of several functions.  If
there is an unexpected state the function silently returns.  Should
these checks really be assert() so there is an abort and backtrace if
the program ever reaches this state?
   
+typedef struct EventTapBlkReq {
+    char *device_name;
+    int num_reqs;
+    int num_cbs;
+    bool is_multiwrite;
   
Is multiwrite logging necessary?  If event tap is called from within
the block layer then multiwrite is turned into one or more
bdrv_aio_writev() calls.
   
+static void event_tap_replay(void *opaque, int running, int reason)
+{
+    EventTapLog *log, *next;
+
+    if (!running) {
+        return;
+    }
+
+    if (event_tap_state != EVENT_TAP_LOAD) {
+        return;
+    }
+
+    event_tap_state = EVENT_TAP_REPLAY;
+
+    QTAILQ_FOREACH(log, event_list, node) {
+        EventTapBlkReq *blk_req;
+
+        /* event resume */
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_flush(log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            blk_req = log->blk_req;
+            if ((log->mode & EVENT_TAP_TYPE_MASK) == 
EVENT_TAP_IOPORT) {
+                switch (log->ioport.index) {
+                case 0:
+                    cpu_outb(log->ioport.address, 
log->ioport.data);
+                    break;
+                case 1:
+                    cpu_outw(log->ioport.address, 
log->ioport.data);
+                    break;
+                case 2:
+                    cpu_outl(log->ioport.address, 
log->ioport.data);
+                    break;
+                }
+            } else {
+                /* EVENT_TAP_MMIO */
+                cpu_physical_memory_rw(log->mmio.address,
+                                       log->mmio.buf,
+                                       log->mmio.len, 1);
+            }
+            break;
   
Why are net tx packets replayed at the net level but blk requests are
replayed at the pio/mmio level?
   
I expected everything to replay either as pio/mmio or as net/block.
  
   Stefan,
  
   After doing some heavy load tests, I realized that we have to
   take a hybrid approach to replay for now.  This is because the point
   at which a device moves to the next state (e.g. virtio decreases
   inuse) differs between net and block.  For example, virtio-net
   decreases inuse upon returning from the net layer,
   but virtio-blk
   does that inside of the callback.
  
   For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
   For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
   Both are invoked from a callback.
  
   If we only use pio/mmio
   replay, even though event-tap tries to replay net requests, some
   get lost because the state has proceeded already.
  
   It seems that all you need to do to avoid this is to
   delay the callback?
 
  Yeah, if it's possible.  But if you take a look at virtio-net,
  you'll see that virtio_push is called immediately after calling
  qemu_sendv_packet
  while virtio-blk does that in the callback.
 
  This is only if the packet was sent immediately.
  I was referring to the case where the packet is queued.

 I see.  I usually don't see packets get queued in the net layer.
 What would be the effect to devices?  Restraint sending packets?

 Yes.

 
  
   This doesn't
   happen with block, because the state is still old enough to
   replay.  Note that using hybrid approach won't cause duplicated
   requests on the secondary.
  
   An assumption

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow

2011-01-06 Thread Stefan Hajnoczi

On Thu, Jan 6, 2011 at 7:48 AM, Nikola Ciprich extmaill...@linuxbox.cz wrote:
 So windows started checking disk integrity, but the problem is, that
 it's way too slow - after ~12 hours, it's still running and seems
 like it'll take ages to finish.

Please post your KVM command-line.

Have you run storage benchmarks on the host to check what sort of
maximum I/O performance you can expect?  Do you have a RAID setup
underneath LVM?

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow


On 01/06/2011 09:48 AM, Nikola Ciprich wrote:

Hi,
I'd like to ask for advice with following problem.
I have windows 2008 terminal server guest running on 2.6.36 x86_64
host (kvm 0.13.0).
guest has 4GB of RAM, 40GB storage on top of LVM volume and two cores.
So far everything was running fine, but during periodic maintenance
I wanted to force chkdisk after reboot.
So windows started checking disk integrity, but the problem is, that
it's way too slow - after ~12 hours, it's still running and seems
like it'll take ages to finish.
Both CPU cores seem to be fully loaded.
Is there some way I could check why it's taking so long, and fix
it eventually?
can I use kvm_trace to achieve this task? how?


Let's start with a few 'kvm_stat -1' snapshots while this is going on.

http://git.kernel.org/?p=virt/kvm/qemu-kvm.git;a=blob_plain;f=kvm/kvm_stat;hb=HEAD

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow

Hello Stefan!
 Please post your KVM command-line.
/usr/bin/qemu-kvm -S -M pc-0.13 -enable-kvm -m 4096 -smp 
2,sockets=2,cores=1,threads=1 -name vmwts02 -uuid 
1e501300-dc48-11df-a690-00304834195b -nodefconfig -nodefaults -chardev 
socket,id=monitor,path=/var/lib/libvirt/qemu/vmwts02.monitor,server,nowait -mon 
chardev=monitor,mode=readline -rtc base=localtime -boot c -drive 
file=/dev/vgshared/vmwts02-1,if=none,id=drive-ide0-0-0,format=raw -device 
ide-drive,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0 -drive 
if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw -device 
ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 -netdev 
tap,fd=22,id=hostnet0 -device 
rtl8139,netdev=hostnet0,id=net0,mac=00:16:3e:61:01:00,bus=pci.0,addr=0x3 -usb 
-vnc 0.0.0.0:30801 -vga cirrus -device 
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4

I see I could disable cache for storage, but I don't want to kill
fsck now just to test if it helps (and I guess it shouldn't make
such a difference).

 Have you run storage benchmarks on the host to check what sort of
 maximum I/O performance you can expect?  Do you have a RAID setup
 underneath LVM?
Not for windows, but in general it is running quite fast, only the chkdsk
seems to be bad. In other VMs (linux), I'm achieving write speeds of 40MB/s.
Storage configuration is a bit complex, it's DRBD replicated storage,
on top of it sits clustered LVM and the KVM guests use logical volumes on top of it.
but as I said, overall performance is OK.

 
 Stefan
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow

Hello Avi!
On Thu, Jan 06, 2011 at 11:08:32AM +0200, Avi Kivity wrote:
 Let's start with a few 'kvm_stat -1' snapshots while this is going on.

here it is, but note that there are a few more machines running on this host.
but they're almost idle in contrast to this windows one, so I hope it's not
a problem.

kvm_ack_irq  299   290
kvm_age_page  7169
kvm_apic1126  1087
kvm_apic_accept_irq  453   438
kvm_apic_ipi 236   228
kvm_cpuid  0 0
kvm_cr   780   772
kvm_emulate_insn   46181 44982
kvm_entry  55012 53867
kvm_exit   55182 53884
kvm_exit(APIC_ACCESS)  3 0
kvm_exit(CPUID)3 0
kvm_exit(CR_ACCESS)  784   772
kvm_exit(DR_ACCESS)3 0
kvm_exit(EPT_MISCONFIG) 4 0
kvm_exit(EPT_VIOLATION) 4 0
kvm_exit(EXCEPTION_NMI) 50194 49039
kvm_exit(EXTERNAL_INTERRUPT)  2314  2213
kvm_exit(HLT)260   248
kvm_exit(INVALID_STATE) 3 0
kvm_exit(INVLPG) 10398
kvm_exit(IO_INSTRUCTION)  1538  1534
kvm_exit(MCE_DURING_VMENTRY) 5 0
kvm_exit(MONITOR_INSTRUCTION) 3 0
kvm_exit(MSR_READ) 5 0
kvm_exit(MSR_WRITE)4 0
kvm_exit(MWAIT_INSTRUCTION) 3 0
kvm_exit(NMI_WINDOW)   4 0
kvm_exit(PAUSE_INSTRUCTION) 4 0
kvm_exit(PENDING_INTERRUPT)12 7
kvm_exit(RDPMC)4 0
kvm_exit(RDTSC)4 0
kvm_exit(TASK_SWITCH)  3 0
kvm_exit(TPR_BELOW_THRESHOLD) 6 2
kvm_exit(TRIPLE_FAULT) 5 0
kvm_exit(VMCALL)   4 0
kvm_exit(VMCLEAR)  3 0
kvm_exit(VMLAUNCH) 3 0
kvm_exit(VMOFF)4 0
kvm_exit(VMON) 3 0
kvm_exit(VMPTRLD)  4 0
kvm_exit(VMPTRST)  4 0
kvm_exit(VMREAD)   5 0
kvm_exit(VMRESUME) 3 0
kvm_exit(VMWRITE)  4 0
kvm_exit(WBINVD)   3 0
kvm_exit(XSETBV)   4 0
kvm_fpu  247   243
kvm_hv_hypercall   0 0
kvm_hypercall  0 0
kvm_inj_exception  0 0
kvm_inj_virq 514   500
kvm_invlpga0 0
kvm_ioapic_set_irq   363   353
kvm_mmio   67725 65974
kvm_msi_set_irq0 0
kvm_msr0 0
kvm_nested_intercepts  0 0
kvm_nested_intr_vmexit 0 0
kvm_nested_vmexit  0 0
kvm_nested_vmexit_inject 0 0
kvm_nested_vmrun   0 0
kvm_page_fault 50200 48785
kvm_pic_set_irq  363   353
kvm_pio 1541  1541
kvm_set_irq  363   353
kvm_skinit 0 0



 http://git.kernel.org/?p=virt/kvm/qemu-kvm.git;a=blob_plain;f=kvm/kvm_stat;hb=HEAD

 -- 
 error compiling committee.c: too many arguments to function

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow


On 01/06/2011 11:20 AM, Nikola Ciprich wrote:

Hello Avi!
On Thu, Jan 06, 2011 at 11:08:32AM +0200, Avi Kivity wrote:
  Let's start with a few 'kvm_stat -1' snapshots while this is going on.

here it is, but note that there are few more machines running on this host.
but they're almost idle in contrast to this windows one, so I hope it's not 
problem.



kvm_cr   780   772
kvm_emulate_insn   46181 44982
kvm_entry  55012 53867
kvm_exit   55182 53884


It's emulating quite a bit.  Let's see why.

- install udis86 and udis86-devel
- build and install 
git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/trace-cmd.git
- run trace-cmd record -e kvm -b 10 -P pid1 -P pid2, ctrl-C after a 
few seconds (pid1/pid2 are thread ids from 'info cpus' of the bad guest, 
plus the pid of the qemu process itself)

- post the resulting trace.dat somewhere


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow


On 01/06/2011 11:27 AM, Avi Kivity wrote:

On 01/06/2011 11:20 AM, Nikola Ciprich wrote:

Hello Avi!
On Thu, Jan 06, 2011 at 11:08:32AM +0200, Avi Kivity wrote:
  Let's start with a few 'kvm_stat -1' snapshots while this is going 
on.


here it is, but note that there are few more machines running on this 
host.
but they're almost idle in contrast to this windows one, so I hope 
it's not problem.



kvm_cr   780   772
kvm_emulate_insn   46181 44982
kvm_entry  55012 53867
kvm_exit   55182 53884


It's emulating quite a bit.  Let's see why.

- install udis86 and udis86-devel
- build and install 
git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/trace-cmd.git
- run trace-cmd record -e kvm -b 10 -P pid1 -P pid2, ctrl-C after 
a few seconds (pid1/pid2 are thread ids from 'info cpus' of the bad 
guest, plus the pid of the qemu process itself)

- post the resulting trace.dat somewhere



btw, that 10 is 400MB of nonswappable kernel memory per cpu, so if 
you have lots of cpus and not enough memory, adjust it down.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Flow Control and Port Mirroring Revisited

Hi,

Back in October I reported that I noticed a problem whereby flow control
breaks down when openvswitch is configured to mirror a port[1].

I have (finally) looked into this further and the problem appears to relate
to cloning of skbs, as Jesse Gross originally suspected.

More specifically, in do_execute_actions[2] the first n-1 times that an skb
needs to be transmitted it is cloned first and the final time the original
skb is used.

In the case that there is only one action, which is the normal case, then
the original skb will be used. But in the case of mirroring the cloning
comes into effect. And in my case the cloned skb seems to go to the (slow)
eth1 interface while the original skb goes to the (fast) dummy0 interface
that I set up to be a mirror. The result is that dummy0 paces the flow,
and it's a cracking pace at that.

As an experiment I hacked do_execute_actions() to use the original skb
for the first action instead of the last one.  In my case the result was
that eth1 paces the flow, and things work reasonably nicely.

Well, sort of. Things work well for non-GSO skbs but extremely poorly for
GSO skbs where only 3 (yes 3, not 3%) end up at the remote host running
netserv. I'm unsure why, but I digress.

It seems to me that my hack illustrates the point that the flow ends up
being paced by one interface. However I think that what would be
desirable is that the flow is paced by the slowest link. Unfortunately
I'm unsure how to achieve that.

One idea that I had was to skb_get() the original skb each time it is
cloned - that is easy enough. But unfortunately it seems to me that
approach would require some sort of callback mechanism in kfree_skb() so
that the cloned skbs can kfree_skb() the original skb.

Ideas would be greatly appreciated.

[1] 
http://openvswitch.org/pipermail/dev_openvswitch.org/2010-October/003806.html
[2] 
http://openvswitch.org/cgi-bin/gitweb.cgi?p=openvswitch;a=blob;f=datapath/actions.c;h=5e16143ca402f7da0ee8fc18ee5eb16c3b7598e6;hb=HEAD
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Qemu-devel] Re: [PATCH 09/21] Introduce event-tap.

On Thu, Jan 06, 2011 at 05:47:27PM +0900, Yoshiaki Tamura wrote:
 2011/1/4 Michael S. Tsirkin m...@redhat.com:
  On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
  2011/1/4 Michael S. Tsirkin m...@redhat.com:
   On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
   2011/1/4 Michael S. Tsirkin m...@redhat.com:
On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
2010/11/29 Stefan Hajnoczi stefa...@gmail.com:
 On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
 tamura.yoshi...@lab.ntt.co.jp wrote:
 event-tap controls when to start FT transaction, and provides 
 proxy
 functions to called from net/block devices.  While FT 
 transaction, it
 queues up net/block requests, and flush them when the transaction 
 gets
 completed.

 Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp
 Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp
 ---
  Makefile.target |    1 +
  block.h         |    9 +
  event-tap.c     |  794 
 +++
  event-tap.h     |   34 +++
  net.h           |    4 +
  net/queue.c     |    1 +
  6 files changed, 843 insertions(+), 0 deletions(-)
  create mode 100644 event-tap.c
  create mode 100644 event-tap.h

 event_tap_state is checked at the beginning of several functions.  
 If
 there is an unexpected state the function silently returns.  Should
 these checks really be assert() so there is an abort and backtrace 
 if
 the program ever reaches this state?

 +typedef struct EventTapBlkReq {
 +    char *device_name;
 +    int num_reqs;
 +    int num_cbs;
 +    bool is_multiwrite;

 Is multiwrite logging necessary?  If event tap is called from 
 within
 the block layer then multiwrite is turned into one or more
 bdrv_aio_writev() calls.

 +static void event_tap_replay(void *opaque, int running, int 
 reason)
 +{
 +    EventTapLog *log, *next;
 +
 +    if (!running) {
 +        return;
 +    }
 +
 +    if (event_tap_state != EVENT_TAP_LOAD) {
 +        return;
 +    }
 +
 +    event_tap_state = EVENT_TAP_REPLAY;
 +
 +    QTAILQ_FOREACH(log, event_list, node) {
 +        EventTapBlkReq *blk_req;
 +
 +        /* event resume */
 +        switch (log-mode  ~EVENT_TAP_TYPE_MASK) {
 +        case EVENT_TAP_NET:
 +            event_tap_net_flush(log-net_req);
 +            break;
 +        case EVENT_TAP_BLK:
 +            blk_req = log-blk_req;
 +            if ((log-mode  EVENT_TAP_TYPE_MASK) == 
 EVENT_TAP_IOPORT) {
 +                switch (log-ioport.index) {
 +                case 0:
 +                    cpu_outb(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                case 1:
 +                    cpu_outw(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                case 2:
 +                    cpu_outl(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                }
 +            } else {
 +                /* EVENT_TAP_MMIO */
 +                cpu_physical_memory_rw(log-mmio.address,
 +                                       log-mmio.buf,
 +                                       log-mmio.len, 1);
 +            }
 +            break;

 Why are net tx packets replayed at the net level but blk requests 
 are
 replayed at the pio/mmio level?

 I expected everything to replay either as pio/mmio or as net/block.
   
Stefan,
   
After doing some heavy load tests, I realized that we have to
take a hybrid approach to replay for now.  This is because when a
device moves to the next state (e.g. virtio decreases inuse) is
different between net and block.  For example, virtio-net
decreases inuse upon returning from the net layer,
but virtio-blk
does that inside of the callback.
   
For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
Both are invoked from a callback.
   
If we only use pio/mmio
replay, even though event-tap tries to replay net requests, some
get lost because the state has proceeded already.
   
It seems that all you need to do to avoid this is to
delay the callback?
  
   Yeah, if it's possible.  But if you take a look at virtio-net,
   you'll see that virtio_push is called immediately after calling
   qemu_sendv_packet
   while virtio-blk does that in the callback.
  
   This is only if the packet was sent immediately.
   I was referring to the case where the packet is queued.
 
  I see.  I usually don't see packets get queued in the net layer.
  What would be the effect to devices?

Re: [Qemu-devel] Re: [PATCH 09/21] Introduce event-tap.

2011-01-06 Thread Yoshiaki Tamura

2011/1/6 Michael S. Tsirkin m...@redhat.com:
 On Thu, Jan 06, 2011 at 05:47:27PM +0900, Yoshiaki Tamura wrote:
 2011/1/4 Michael S. Tsirkin m...@redhat.com:
  On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
  2011/1/4 Michael S. Tsirkin m...@redhat.com:
   On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
   2011/1/4 Michael S. Tsirkin m...@redhat.com:
On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
2010/11/29 Stefan Hajnoczi stefa...@gmail.com:
 On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
 tamura.yoshi...@lab.ntt.co.jp wrote:
 event-tap controls when to start FT transaction, and provides 
 proxy
 functions to called from net/block devices.  While FT 
 transaction, it
 queues up net/block requests, and flush them when the 
 transaction gets
 completed.

 Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp
 Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp
 ---
  Makefile.target |    1 +
  block.h         |    9 +
  event-tap.c     |  794 
 +++
  event-tap.h     |   34 +++
  net.h           |    4 +
  net/queue.c     |    1 +
  6 files changed, 843 insertions(+), 0 deletions(-)
  create mode 100644 event-tap.c
  create mode 100644 event-tap.h

 event_tap_state is checked at the beginning of several functions. 
  If
 there is an unexpected state the function silently returns.  
 Should
 these checks really be assert() so there is an abort and 
 backtrace if
 the program ever reaches this state?

 +typedef struct EventTapBlkReq {
 +    char *device_name;
 +    int num_reqs;
 +    int num_cbs;
 +    bool is_multiwrite;

 Is multiwrite logging necessary?  If event tap is called from 
 within
 the block layer then multiwrite is turned into one or more
 bdrv_aio_writev() calls.

 +static void event_tap_replay(void *opaque, int running, int 
 reason)
 +{
 +    EventTapLog *log, *next;
 +
 +    if (!running) {
 +        return;
 +    }
 +
 +    if (event_tap_state != EVENT_TAP_LOAD) {
 +        return;
 +    }
 +
 +    event_tap_state = EVENT_TAP_REPLAY;
 +
 +    QTAILQ_FOREACH(log, event_list, node) {
 +        EventTapBlkReq *blk_req;
 +
 +        /* event resume */
 +        switch (log-mode  ~EVENT_TAP_TYPE_MASK) {
 +        case EVENT_TAP_NET:
 +            event_tap_net_flush(log-net_req);
 +            break;
 +        case EVENT_TAP_BLK:
 +            blk_req = log-blk_req;
 +            if ((log-mode  EVENT_TAP_TYPE_MASK) == 
 EVENT_TAP_IOPORT) {
 +                switch (log-ioport.index) {
 +                case 0:
 +                    cpu_outb(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                case 1:
 +                    cpu_outw(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                case 2:
 +                    cpu_outl(log-ioport.address, 
 log-ioport.data);
 +                    break;
 +                }
 +            } else {
 +                /* EVENT_TAP_MMIO */
 +                cpu_physical_memory_rw(log-mmio.address,
 +                                       log-mmio.buf,
 +                                       log-mmio.len, 1);
 +            }
 +            break;

 Why are net tx packets replayed at the net level but blk requests 
 are
 replayed at the pio/mmio level?

 I expected everything to replay either as pio/mmio or as 
 net/block.
   
Stefan,
   
After doing some heavy load tests, I realized that we have to
take a hybrid approach to replay for now.  This is because when a
device moves to the next state (e.g. virtio decreases inuse) is
different between net and block.  For example, virtio-net
decreases inuse upon returning from the net layer,
but virtio-blk
does that inside of the callback.
   
For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
Both are invoked from a callback.
   
If we only use pio/mmio
replay, even though event-tap tries to replay net requests, some
get lost because the state has proceeded already.
   
It seems that all you need to do to avoid this is to
delay the callback?
  
   Yeah, if it's possible.  But if you take a look at virtio-net,
   you'll see that virtio_push is called immediately after calling
   qemu_sendv_packet
   while virtio-blk does that in the callback.
  
   This is only if the packet was sent immediately.
   I was referring to the case where the packet is queued.
 
  I see.  I usually don't see packets get queued in the

Re: qemu-kvm-0.13.0 - windows 2008 - chkdisk too slow

 - run trace-cmd record -e kvm -b 10 -P pid1 -P pid2, ctrl-C after a  
seems like it's not possible to specify multiple pids, so
I've run 4 commands in parallel. Also I can't get monitor information
since vm is started using libvirt, so I've just used all machine's qemu-kvm
pids..
hope it's OK
here's the trace:
http://nelide.cz/downloads/nik/trace.tar.bz2
n.

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

KVM TSC trapping

2011-01-06 Thread Zachary Amsden

On top of my last patchset, I now implement TSC trapping and
a flexible migration scheme for maintaining stable TSC across
migration.  Since it is administratively configured, it can
be selectively enabled only for VMs which require it.  In
particular, VMs which use KVM clock probably do not want it.

We will need some administrative controls in qemu and in libvirt
to make full use of this, but the design allows flexible and
fairly simple control.  In addition, users can override these
settings with module parameters to forcibly disable or enable
TSC trapping globally, for testing, workarounds, or performance.

This may be slightly out of date, but I would like to solicit
feedback about the patches, #2 especially.

--Zach

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h

Then it can be used by other structs in kvm_host.h

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 include/linux/kvm_host.h |   23 +++
 virt/kvm/iodev.h |   25 +
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, 
gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+   int (*read)(struct kvm_io_device *this,
+   gpa_t addr,
+   int len,
+   void *val);
+   int (*write)(struct kvm_io_device *this,
+gpa_t addr,
+int len,
+const void *val);
+   void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+   const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-   int (*read)(struct kvm_io_device *this,
-   gpa_t addr,
-   int len,
-   void *val);
-   int (*write)(struct kvm_io_device *this,
-gpa_t addr,
-int len,
-const void *val);
-   void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-   const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/3 v7] MSI-X MMIO support for KVM

Change from v6:
1. Discard PBA support. But we can still add it later.
2. Fix one memory reference bug
3. Automatically unregister MMIO after the device is deassigned.
4. Update according to Avi's comments.
5. Add documents for new API.

Notice this patchset depends on two PCI patches named:

PCI: MSI: Move MSI-X entry definition to pci_regs.h
PCI: Add mask bit definition for MSI-X table

These two patches are in Jesse's pci-2.6 tree. Do I need to repost them?

Sheng Yang (3):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt |   41 +++
 arch/x86/kvm/Makefile |2 +-
 arch/x86/kvm/x86.c|8 +-
 include/linux/kvm.h   |   21 
 include/linux/kvm_host.h  |   48 
 virt/kvm/assigned-dev.c   |   44 +++
 virt/kvm/iodev.h  |   25 +
 virt/kvm/kvm_main.c   |   38 ++-
 virt/kvm/msix_mmio.c  |  284 +
 virt/kvm/msix_mmio.h  |   25 
 10 files changed, 505 insertions(+), 31 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/3] KVM: Emulate MSI-X table in kernel

Then we can support mask bit operation of assigned devices now.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/Makefile|2 +-
 arch/x86/kvm/x86.c   |8 +-
 include/linux/kvm.h  |   21 
 include/linux/kvm_host.h |   25 
 virt/kvm/assigned-dev.c  |   44 +++
 virt/kvm/kvm_main.c  |   38 ++-
 virt/kvm/msix_mmio.c |  284 ++
 virt/kvm/msix_mmio.h |   25 
 8 files changed, 440 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
-   assigned-dev.o)
+   assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_MSIX_MMIO:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long 
addr,
   struct kvm_vcpu *vcpu)
 {
gpa_t gpa;
+   int r;
 
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned 
long addr,
 
 mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
/*
 * Is this MMIO handled locally?
 */
-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!r)
return X86EMUL_CONTINUE;
 
vcpu-mmio_needed = 1;
-   vcpu-run-exit_reason = KVM_EXIT_MMIO;
+   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
+   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
vcpu-run-mmio.len = vcpu-mmio_size = bytes;
vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK   0xff00
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type;
+   __u16 max_entries_nr;
+   __u64 base_addr;
+   __u64 base_va;
+   __u64 flags;
+   __u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX32
+
+struct kvm_msix_mmio {
+   u32 dev_id;
+   u16 type;
+   u16 max_entries_nr;
+   u64 flags;
+   gpa_t table_base_addr;
+   hva_t table_base_va;
+   gpa_t pba_base_addr;
+   hva_t

[PATCH 3/3] KVM: Add documents for MSI-X MMIO API


Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 Documentation/kvm/api.txt |   41 +
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..4978b94 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,47 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API registers the MSI-X MMIO address of a guest device. All MMIO
+operations on it are then handled by the kernel. When necessary (e.g. MSI
+data/address changed), KVM exits to userspace using
+KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and requires
+userspace to update the IRQ routing table.
+
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type; /* Device type and MMIO address type */
+   __u16 max_entries_nr;   /* Maximum entries supported */
+   __u64 base_addr;/* Guest physical address of MMIO */
+   __u64 base_va;  /* Host virtual address of MMIO mapping */
+   __u64 flags;/* Reserved for now */
+   __u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV    (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API unregisters the specified MSI-X MMIO region, identified by the
+dev_id and type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow


On 01/06/2011 11:42 AM, Nikola Ciprich wrote:

  - run trace-cmd record -e kvm -b 10 -P pid1 -P pid2, ctrl-C after a
seems like it's not possible to specify multiple pids, so


Did you get 'overrun: something' reports from trace-cmd, where something 
!= 0?


If you're not sure, please run the trace again.  Also try adding '-r 10' 
to the command line.



I've run 4 commands in parallel. Also I can't get monitor information
since vm is started using libvirt, so I've just used all machine's qemu-kvm
pids..


Dan, is there a way to hijack the monitor so we can run some commands on 
it?  Things like 'info registers' and disassembly.



hope it's OK
here's the trace:
http://nelide.cz/downloads/nik/trace.tar.bz2
n.



Looks like vcpu 1 is spinning; perhaps that's normal.  If you get hold 
of the monitor, please disassemble around 0xf80001575d59.


vcpu 0 is busy writing to vga (can you confirm)? looks like bank 
switching is hitting synchronize_srcu_expedited(), which is known slow.  
Unfortunately that only gets better in 2.6.38.


You can try applying 
http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git;a=commit;h=46fdb0937f26124700fc9fc80da4776330cc00d3 
and see if it helps.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/4 v7] qemu-kvm: MSI-X MMIO support for assigned device

Update with kernel patches v7.

Sheng Yang (4):
  qemu-kvm: device assignment: Enabling MSI-X according to the entries'
mask bit
  qemu-kvm: Ioctl for MSIX MMIO support
  qemu-kvm: Header file update for MSI-X MMIO support
  qemu-kvm: MSI-X MMIO support for assigned device

 hw/device-assignment.c  |  275 --
 hw/device-assignment.h  |5 +-
 kvm/include/linux/kvm.h |   21 
 qemu-kvm.c  |   54 +
 qemu-kvm.h  |   18 +++
 5 files changed, 336 insertions(+), 37 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/4] qemu-kvm: Header file update for MSI-X MMIO support


Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 kvm/include/linux/kvm.h |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/kvm/include/linux/kvm.h b/kvm/include/linux/kvm.h
index e46729e..7b6d5b9 100644
--- a/kvm/include/linux/kvm.h
+++ b/kvm/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -530,6 +531,7 @@ struct kvm_enable_cap {
 #ifdef __KVM_HAVE_XCRS
 #define KVM_CAP_XCRS 56
 #endif
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -660,6 +662,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -781,4 +786,20 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK   0xff00
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type;
+   __u16 max_entries_nr;
+   __u64 base_addr;
+   __u64 base_va;
+   __u64 flags;
+   __u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
-- 
1.7.0.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 4/4] qemu-kvm: MSI-X MMIO support for assigned device


Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 hw/device-assignment.c |   93 +--
 hw/device-assignment.h |3 ++
 qemu-kvm.c |   40 
 qemu-kvm.h |   11 ++
 4 files changed, 135 insertions(+), 12 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index f81050f..bddee2a 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -70,6 +70,11 @@ static void assigned_device_pci_cap_write_config(PCIDevice 
*pci_dev,
 static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
 uint32_t address, int len);
 
+static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
+{
+return (uint32_t)seg  16 | (uint32_t)bus  8 | (uint32_t)devfn;
+}
+
 static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
uint32_t addr, int len, uint32_t *val)
 {
@@ -272,6 +277,10 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 AssignedDevRegion *region = r_dev-v_addrs[region_num];
 PCIRegion *real_region = r_dev-real_device.regions[region_num];
 int ret = 0;
+#ifdef KVM_CAP_MSIX_MMIO
+int cap_mask = kvm_check_extension(kvm_state, KVM_CAP_MSIX_MMIO);
+struct kvm_msix_mmio_user msix_mmio;
+#endif
 
 DEBUG(e_phys=%08 FMT_PCIBUS  r_virt=%p type=%d len=%08 FMT_PCIBUS  
region_num=%d \n,
   e_phys, region-u.r_virtbase, type, e_size, region_num);
@@ -290,6 +299,23 @@ static void assigned_dev_iomem_map(PCIDevice *pci_dev, int 
region_num,
 
 cpu_register_physical_memory(e_phys + offset,
 TARGET_PAGE_SIZE, r_dev-mmio_index);
+#ifdef KVM_CAP_MSIX_MMIO
+if (cap_mask) {
+r_dev-guest_msix_table_addr = e_phys + offset;
+memset(msix_mmio, 0, sizeof msix_mmio);
+msix_mmio.dev_id = calc_assigned_dev_id(r_dev-h_segnr,
+r_dev-h_busnr, r_dev-h_devfn);
+msix_mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+   KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+msix_mmio.base_addr = e_phys + offset;
+msix_mmio.base_va = (unsigned long)r_dev-msix_table_page;
+msix_mmio.max_entries_nr = r_dev-max_msix_entries_nr;
+msix_mmio.flags = 0;
+ret = kvm_register_msix_mmio(kvm_context, msix_mmio);
+if (ret)
+fprintf(stderr, fail to register in-kernel msix_mmio!\n);
+}
+#endif
 }
 }
 
@@ -852,11 +878,6 @@ static void free_assigned_device(AssignedDevice *dev)
 }
 }
 
-static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
-{
-return (uint32_t)seg  16 | (uint32_t)bus  8 | (uint32_t)devfn;
-}
-
 static void assign_failed_examine(AssignedDevice *dev)
 {
 char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
@@ -1263,6 +1284,8 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev,
 return r;
 }
 
+static int assigned_dev_update_routing_handler(void *opaque, unsigned long 
addr);
+
 static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
 {
 struct kvm_assigned_irq assigned_irq_data;
@@ -1486,7 +1509,9 @@ static int assigned_device_pci_cap_init(PCIDevice 
*pci_dev)
 msix_table_entry = pci_get_long(pci_dev-config + pos + 
PCI_MSIX_TABLE);
 bar_nr = msix_table_entry  PCI_MSIX_BIR;
 msix_table_entry = ~PCI_MSIX_BIR;
-dev-msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
+dev-msix_table_addr = pci_region[bar_nr].base_addr +
+   msix_table_entry;
+
 dev-max_msix_entries_nr = get_msix_entries_max_nr(dev);
 }
 #endif
@@ -1670,8 +1695,7 @@ static uint32_t msix_mmio_readw(void *opaque, 
target_phys_addr_t addr)
 (8 * (addr  3)))  0x;
 }
 
-static void msix_mmio_writel(void *opaque,
- target_phys_addr_t addr, uint32_t val)
+static void assigned_dev_update_routing(void *opaque, unsigned long addr)
 {
 AssignedDevice *adev = opaque;
 unsigned int offset = addr  0xfff;
@@ -1683,10 +1707,6 @@ static void msix_mmio_writel(void *opaque,
 struct PCIDevice *pci_dev = adev-dev;
 uint8_t cap = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
 
-DEBUG(write to MSI-X entry table mmio offset 0x%lx, val 0x%x\n,
-   addr, val);
-memcpy((void *)((char *)page + offset), val, 4);
-
 index = offset / 16;
 
 /* Check if mask bit is being accessed */
@@ -1762,6 +1782,41 @@ static void msix_mmio_writel(void *opaque,
 adev-entry[entry_idx].u.msi.data = msg_data;
 }
 
+static int assigned_dev_update_routing_handler(void *opaque, unsigned long 
addr)
+{
+AssignedDevice *adev = opaque;
+
+if (addr = adev-guest_msix_table_addr 
+

[PATCH 1/4] qemu-kvm: device assignment: Enabling MSI-X according to the entries' mask bit

The old MSI-X enabling method assumes the entries are written before MSI-X is
enabled, but some OSes don't obey this, e.g. FreeBSD. This patch fixes
this.

Also, according to the PCI spec, mask bit of MSI-X table should be set
after reset.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 hw/device-assignment.c |  188 +---
 hw/device-assignment.h |2 +-
 2 files changed, 162 insertions(+), 28 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 8446cd4..f81050f 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -1141,15 +1141,12 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev, 
unsigned int ctrl_pos)
 #endif
 
 #ifdef KVM_CAP_DEVICE_MSIX
-static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
+
+#define PCI_MSIX_CTRL_MASKBIT  1ul
+static int get_msix_entries_max_nr(AssignedDevice *adev)
 {
-AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
-uint16_t entries_nr = 0, entries_max_nr;
-int pos = 0, i, r = 0;
-uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
-struct kvm_assigned_msix_nr msix_nr;
-struct kvm_assigned_msix_entry msix_entry;
-void *va = adev-msix_table_page;
+int pos, entries_max_nr;
+PCIDevice *pci_dev = adev-dev;
 
 pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);
 
@@ -1157,20 +1154,48 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 entries_max_nr = PCI_MSIX_TABSIZE;
 entries_max_nr += 1;
 
+return entries_max_nr;
+}
+
+static int assigned_dev_msix_entry_masked(AssignedDevice *adev, int entry)
+{
+uint32_t msg_ctrl;
+void *va = adev-msix_table_page;
+
+memcpy(msg_ctrl, va + entry * 16 + 12, 4);
+return (msg_ctrl  PCI_MSIX_CTRL_MASKBIT);
+}
+
+static int get_msix_valid_entries_nr(AssignedDevice *adev,
+uint16_t entries_max_nr)
+{
+void *va = adev-msix_table_page;
+uint32_t msg_ctrl;
+uint16_t entries_nr = 0;
+int i;
+
 /* Get the usable entry number for allocating */
 for (i = 0; i  entries_max_nr; i++) {
 memcpy(msg_ctrl, va + i * 16 + 12, 4);
-memcpy(msg_data, va + i * 16 + 8, 4);
 /* Ignore unused entry even it's unmasked */
-if (msg_data == 0)
+if (assigned_dev_msix_entry_masked(adev, i))
 continue;
 entries_nr ++;
 }
+return entries_nr;
+}
+
+static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev,
+ uint16_t entries_nr,
+ uint16_t entries_max_nr)
+{
+AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
+int i, r = 0;
+uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
+struct kvm_assigned_msix_nr msix_nr;
+struct kvm_assigned_msix_entry msix_entry;
+void *va = adev-msix_table_page;
 
-if (entries_nr == 0) {
-fprintf(stderr, MSI-X entry number is zero!\n);
-return -EINVAL;
-}
 msix_nr.assigned_dev_id = calc_assigned_dev_id(adev-h_segnr, 
adev-h_busnr,
   (uint8_t)adev-h_devfn);
 msix_nr.entry_nr = entries_nr;
@@ -1182,6 +1207,8 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 }
 
 free_dev_irq_entries(adev);
+memset(pci_dev-msix_entry_used, 0, KVM_MAX_MSIX_PER_DEV *
+sizeof(*pci_dev-msix_entry_used));
 adev-irq_entries_nr = entries_nr;
 adev-entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry));
 if (!adev-entry) {
@@ -1195,10 +1222,10 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 if (entries_nr = msix_nr.entry_nr)
 break;
 memcpy(msg_ctrl, va + i * 16 + 12, 4);
-memcpy(msg_data, va + i * 16 + 8, 4);
-if (msg_data == 0)
+if (assigned_dev_msix_entry_masked(adev, i))
 continue;
 
+memcpy(msg_data, va + i * 16 + 8, 4);
 memcpy(msg_addr, va + i * 16, 4);
 memcpy(msg_upper_addr, va + i * 16 + 4, 4);
 
@@ -1212,17 +1239,18 @@ static int assigned_dev_update_msix_mmio(PCIDevice 
*pci_dev)
 adev-entry[entries_nr].u.msi.address_lo = msg_addr;
 adev-entry[entries_nr].u.msi.address_hi = msg_upper_addr;
 adev-entry[entries_nr].u.msi.data = msg_data;
-DEBUG(MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!, msg_data, msg_addr);
-   kvm_add_routing_entry(adev-entry[entries_nr]);
+DEBUG(MSI-X data 0x%x, MSI-X addr_lo 0x%x!\n, msg_data, msg_addr);
+kvm_add_routing_entry(adev-entry[entries_nr]);
 
 msix_entry.gsi = adev-entry[entries_nr].gsi;
 msix_entry.entry = i;
+pci_dev-msix_entry_used[i] = 1;
 r = kvm_assign_set_msix_entry(kvm_context, msix_entry);
 if (r) {
 fprintf(stderr, fail to set MSI-X entry! %s\n, strerror(-r));
 break;
 }
-DEBUG(MSI-X

Re: Flow Control and Port Mirroring Revisited

2011-01-06 Thread Eric Dumazet

Le jeudi 06 janvier 2011 à 18:33 +0900, Simon Horman a écrit :
 Hi,
 
 Back in October I reported that I noticed a problem whereby flow control
 breaks down when openvswitch is configured to mirror a port[1].
 
 I have (finally) looked into this further and the problem appears to relate
 to cloning of skbs, as Jesse Gross originally suspected.
 
 More specifically, in do_execute_actions[2] the first n-1 times that an skb
 needs to be transmitted it is cloned first and the final time the original
 skb is used.
 
 In the case that there is only one action, which is the normal case, then
 the original skb will be used. But in the case of mirroring the cloning
 comes into effect. And in my case the cloned skb seems to go to the (slow)
 eth1 interface while the original skb goes to the (fast) dummy0 interface
 that I set up to be a mirror. The result is that dummy0 paces the flow,
 and it's a cracking pace at that.
 
 As an experiment I hacked do_execute_actions() to use the original skb
 for the first action instead of the last one.  In my case the result was
 that eth1 paces the flow, and things work reasonably nicely.
 
 Well, sort of. Things work well for non-GSO skbs but extremely poorly for
 GSO skbs where only 3 (yes 3, not 3%) end up at the remote host running
 netserv. I'm unsure why, but I digress.
 
 It seems to me that my hack illustrates the point that the flow ends up
 being paced by one interface. However I think that what would be
 desirable is that the flow is paced by the slowest link. Unfortunately
 I'm unsure how to achieve that.
 

Hi Simon !

pacing is done because skb is attached to a socket, and a socket has a
limited (but configurable) sndbuf. sk->sk_wmem_alloc is the current sum
of all truesize skbs in flight.

When you enter something that :

1) Get a clone of the skb, queue the clone to device X
2) queue the original skb to device Y

Then :  Socket sndbuf is not affected at all by device X queue.
This is speed on device Y that matters.

You want to get servo control on both X and Y

You could try to

1) Get a clone of skb
   Attach it to socket too (so that socket get a feedback of final
orphaning for the clone) with skb_set_owner_w()
   queue the clone to device X

Unfortunately, stacked skb->destructor() makes this possible only for
a known destructor (aka sock_wfree())

 One idea that I had was to skb_get() the original skb each time it is
 cloned - that is easy enough. But unfortunately it seems to me that
 approach would require some sort of callback mechanism in kfree_skb() so
 that the cloned skbs can kfree_skb() the original skb.
 
 Ideas would be greatly appreciated.
 
 [1] 
 http://openvswitch.org/pipermail/dev_openvswitch.org/2010-October/003806.html
 [2] 
 http://openvswitch.org/cgi-bin/gitweb.cgi?p=openvswitch;a=blob;f=datapath/actions.c;h=5e16143ca402f7da0ee8fc18ee5eb16c3b7598e6;hb=HEAD
 --


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

Did you get 'overrun: something' reports from trace-cmd, where something
!= 0?
nope, all entries were 0.

Dan, is there a way to hijack the monitor so we can run some commands on
it? Things like 'info registers' and disassembly.
AFAIK that's intentionally not possible :(
pity..

Looks like vcpu 1 is spinning; perhaps that's normal. If you get hold
of the monitor, please disassemble around 0xf80001575d59.

vcpu 0 is busy writing to vga (can you confirm)? looks like bank
switching is hitting synchronize_srcu_expedited(), which is known slow.
Unfortunately that only gets better in 2.6.38.
I guess it might help if I would just risk killing the machine and run
it without libvirt so we can debug better right?
or even better, I'll try to reproduce on another 2K8 windows on testing
machine and then we can play more with it.
gimme a few minutes please..

You can try applying
http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git;a=commit;h=46fdb0937f26124700fc9fc80da4776330cc00d3

and see if it helps.
I'll also prepare testing kernel including this patch..
BTW Is it just me, or is git.kernel.org pretty slow today?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

--
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.: +420 596 603 142
fax:+420 596 621 273
mobil: +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 06:33:12PM +0900, Simon Horman wrote:
Hi,

Back in October I reported that I noticed a problem whereby flow control
breaks down when openvswitch is configured to mirror a port[1].

Apropos the UDP flow control. See this
http://www.spinics.net/lists/netdev/msg150806.html
for some problems it introduces.
Unfortunately UDP does not have built-in flow control.
At some level it's just conceptually broken:
it's not present in physical networks so why should
we try and emulate it in a virtual network?

Specifically, when you do:
# netperf -c -4 -t UDP_STREAM -H 172.17.60.218 -l 30 -- -m 1472
You are asking: what happens if I push data faster than it can be received?
But why is this an interesting question?
Ask 'what is the maximum rate at which I can send data with X% packet
loss' or 'what is the packet loss at rate Y Gb/s'. netperf has
-b and -w flags for this. It needs to be configured
with --enable-intervals=yes for them to work.

If you pose the questions this way the problem of pacing
the execution just goes away.

I have (finally) looked into this further and the problem appears to relate
to cloning of skbs, as Jesse Gross originally suspected.

More specifically, in do_execute_actions[2] the first n-1 times that an skb
needs to be transmitted it is cloned first and the final time the original
skb is used.

In the case that there is only one action, which is the normal case, then
the original skb will be used. But in the case of mirroring the cloning
comes into effect. And in my case the cloned skb seems to go to the (slow)
eth1 interface while the original skb goes to the (fast) dummy0 interface
that I set up to be a mirror. The result is that dummy0 paces the flow,
and it's a cracking pace at that.

As an experiment I hacked do_execute_actions() to use the original skb
for the first action instead of the last one. In my case the result was
that eth1 paces the flow, and things work reasonably nicely.

Well, sort of. Things work well for non-GSO skbs but extremely poorly for
GSO skbs where only 3 (yes 3, not 3%) end up at the remote host running
netserv. I'm unsure why, but I digress.

It seems to me that my hack illustrates the point that the flow ends up
being paced by one interface. However I think that what would be
desirable is that the flow is paced by the slowest link. Unfortunately
I'm unsure how to achieve that.

What if you have multiple UDP sockets with different targets
in the guest?

One idea that I had was to skb_get() the original skb each time it is
cloned - that is easy enough. But unfortunately it seems to me that
approach would require some sort of callback mechanism in kfree_skb() so
that the cloned skbs can kfree_skb() the original skb.

Ideas would be greatly appreciated.

[1]
http://openvswitch.org/pipermail/dev_openvswitch.org/2010-October/003806.html
[2]
http://openvswitch.org/cgi-bin/gitweb.cgi?p=openvswitch;a=blob;f=datapath/actions.c;h=5e16143ca402f7da0ee8fc18ee5eb16c3b7598e6;hb=HEAD
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR


Am 06.01.2011 um 11:10 schrieb Zachary Amsden zams...@redhat.com:

 Use an MSR to allow soft migration to hosts which do not support
 TSC trapping.  Rather than make this a required element of any
 migration protocol, we allow the TSC rate to be exported as a data
 field (useful in its own right), but we also allow a one time write
 of the MSR during VM creation.  The result is that for the common
 use case, no protocol change is required to communicate TSC rate
 to the receiving host.
 
 This allows administrative tools to configure migration policy
 as they see appropriate.  Rather than dictate this policy with the
 KVM implementation, we properly allow migration to hosts which both
 do and do not support setting of the TSC rate on the receiving end.
 If it is wished to not support migration to a host which lacks
 support for the TSC rate feature, that can be coordinated externally.

Isn't there a real hw equivalent of such a register? It might make more sense 
to just implement that then.


Alex

 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX


Am 06.01.2011 um 11:10 schrieb Zachary Amsden zams...@redhat.com:

 Reasons to trap the TSC are numerous, but we want to avoid it as much
 as possible for performance reasons.
 
 We provide two conservative modes via modules parameters and userspace
 hinting.  First, the module can be loaded with tsc_auto=1 as a module
 parameter, which turns on conservative TSC trapping only when it is
 required (when unstable TSC or faster KHZ CPU is detected).
 
 For userspace hinting, we enable trapping only if necessary.  Userspace
 can hint that a VM needs a fixed frequency TSC, and also that SMP
 stability will be required.  In that case, we conservatively turn on
 trapping when it is needed.  In addition, users may now specify the
 desired TSC rate at which to run.  If this rate differs significantly
 from the host rate, trapping will be enabled.
 
 There is also an override control to allow TSC trapping to be turned on
 or off unconditionally for testing.
 
 We indicate to pvclock users that the TSC is being trapped, to allow
 avoiding overhead and directly using RDTSCP (only for SVM).  This
 optimization is not yet implemented.

When migrating, the implementation could switch from non-trapped to trapped, 
making it less attractive. The guest however does not get notified about this 
change. Same for the other way around.

Would it make sense to add a kvmclock interrupt to notify the guest of such a 
change?


Alex

 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 0/3] Fix a couple of races in vmexit.flat

This patchset fixes a couple of races and speeds up an important function.

Avi Kivity (3):
  smp: fix race in async on_cpu()
  smp: speed up cpu_count()
  vmexit: fix race in joining smp tests

 lib/x86/smp.c |   33 +++--
 x86/vmexit.c  |9 +
 2 files changed, 28 insertions(+), 14 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/3] smp: speed up cpu_count()

cpu_count() is used in important places, like vmexit.flat's measuring
loop, yet it is ridiculously slow as it talks to the firmware config
interface.

Speed it up by reading the value from memory.

Signed-off-by: Avi Kivity a...@redhat.com
---
 lib/x86/smp.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/lib/x86/smp.c b/lib/x86/smp.c
index 8da614a..d41c332 100644
--- a/lib/x86/smp.c
+++ b/lib/x86/smp.c
@@ -78,7 +78,7 @@ void spin_unlock(struct spinlock *lock)
 
 int cpu_count(void)
 {
-return fwcfg_get_nb_cpus();
+return _cpu_count;
 }
 
 int smp_id(void)
@@ -130,6 +130,8 @@ void smp_init(void)
 int i;
 void ipi_entry(void);
 
+_cpu_count = fwcfg_get_nb_cpus();
+
 set_ipi_descriptor(ipi_entry);
 
 setup_smp_id(0);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/3] smp: fix race in async on_cpu()

We fire off the IPI, but don't wait for the other cpu to pick up
the function and data before returning.

Fix by making the other cpu ACK the receipt of the IPI (but still
execute the result asynchronously).

Signed-off-by: Avi Kivity a...@redhat.com
---
 lib/x86/smp.c |   29 -
 1 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/x86/smp.c b/lib/x86/smp.c
index 241f755..8da614a 100644
--- a/lib/x86/smp.c
+++ b/lib/x86/smp.c
@@ -7,15 +7,27 @@
 #define IPI_VECTOR 0x20
 
 static struct spinlock ipi_lock;
-static void (*ipi_function)(void *data);
-static void *ipi_data;
+static volatile void (*ipi_function)(void *data);
+static volatile void *ipi_data;
 static volatile int ipi_done;
+static volatile bool ipi_wait;
+static int _cpu_count;
 
 static __attribute__((used)) void ipi()
 {
-ipi_function(ipi_data);
-apic_write(APIC_EOI, 0);
-ipi_done = 1;
+void (*function)(void *data) = ipi_function;
+void *data = ipi_data;
+bool wait = ipi_wait;
+
+if (!wait) {
+   ipi_done = 1;
+   apic_write(APIC_EOI, 0);
+}
+function(data);
+if (wait) {
+   ipi_done = 1;
+   apic_write(APIC_EOI, 0);
+}
 }
 
 asm (
@@ -92,13 +104,12 @@ static void __on_cpu(int cpu, void (*function)(void 
*data), void *data,
ipi_done = 0;
ipi_function = function;
ipi_data = data;
+   ipi_wait = wait;
apic_icr_write(APIC_INT_ASSERT | APIC_DEST_PHYSICAL | APIC_DM_FIXED
| IPI_VECTOR,
cpu);
-   if (wait) {
-   while (!ipi_done)
-   ;
-   }
+   while (!ipi_done)
+   ;
 }
 spin_unlock(ipi_lock);
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 3/3] vmexit: fix race in joining smp tests

'nr_cpus_done' is not incremented atomically; this has been observed to
cause tests to stall.  Fix by using a proper atomic increment.

Signed-off-by: Avi Kivity a...@redhat.com
---
 x86/vmexit.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/x86/vmexit.c b/x86/vmexit.c
index 875caa3..ad8ab55 100644
--- a/x86/vmexit.c
+++ b/x86/vmexit.c
@@ -2,6 +2,7 @@
 #include libcflat.h
 #include smp.h
 #include processor.h
+#include atomic.h
 
 static unsigned int inl(unsigned short port)
 {
@@ -121,7 +122,7 @@ static struct test {
 };
 
 unsigned iterations;
-volatile int nr_cpus_done;
+static atomic_t nr_cpus_done;
 
 static void run_test(void *_func)
 {
@@ -131,7 +132,7 @@ static void run_test(void *_func)
 for (i = 0; i  iterations; ++i)
 func();
 
-nr_cpus_done++;
+atomic_inc(nr_cpus_done);
 }
 
 static void do_test(struct test *test)
@@ -155,10 +156,10 @@ static void do_test(struct test *test)
for (i = 0; i  iterations; ++i)
func();
} else {
-   nr_cpus_done = 0;
+   atomic_set(nr_cpus_done, 0);
for (i = cpu_count(); i  0; i--)
on_cpu_async(i-1, run_test, func);
-   while (nr_cpus_done  cpu_count())
+   while (atomic_read(nr_cpus_done)  cpu_count())
;
}
t2 = rdtsc();
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

On 01/06/2011 12:25 PM, Nikola Ciprich wrote:

Did you get 'overrun: something' reports from trace-cmd, where something
!= 0?
nope, all entries were 0.

Dan, is there a way to hijack the monitor so we can run some commands on
it? Things like 'info registers' and disassembly.
AFAIK that's intentionally not possible :(
pity..

Looks like vcpu 1 is spinning; perhaps that's normal. If you get hold
of the monitor, please disassemble around 0xf80001575d59.

Thanks, a test machine that we can do horrible things to would be best.

You can try applying

http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git;a=commit;h=46fdb0937f26124700fc9fc80da4776330cc00d3
and see if it helps.
I'll also prepare testing kernel including this patch..
BTW Is it just me, or is git.kernel.org pretty slow today?

2.6.37 is out, perhaps people are all over it.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

2011-01-06 Thread Daniel P. Berrange

On Thu, Jan 06, 2011 at 12:19:21PM +0200, Avi Kivity wrote:
 On 01/06/2011 11:42 AM, Nikola Ciprich wrote:
   - run trace-cmd record -e kvm -b 10 -P pid1 -P pid2, ctrl-C after a
 seems like it's not possible to specify multiple pids, so
 
 Did you get 'overrun: something' reports from trace-cmd, where
 something != 0?
 
 If you're not sure, please run the trace again.  Also try adding '-r
 10' to the command line.
 
 I've run 4 commands in parallel. Also I can't get monitor information
 since vm is started using libvirt, so I've just used all machine's qemu-kvm
 pids..
 
 Dan, is there a way to hijack the monitor so we can run some
 commands on it?  Things like 'info registers' and disassembly.

Depends on the libvirt version. For most, you'll need to
look for the monitor path in the QEMU argv:

  -chardev
+socket,id=monitor,path=/var/lib/libvirt/qemu/vmwts02.monitor,server,nowait 
-mon   chardev=monitor,mode=readline

then, 'service libvirtd stop' and now you can connect to
the monitor at that path, run the commands you want, and then
disconnect and start libvirtd again. If you run any commands
that change the VM state, things may well get confused when
you start libvirtd again, but if it's just 'info registers'
etc it should be pretty safe.

If you have a new enough libvirt, then you can also send
commands directly using 'virsh qemu-monitor-command' (checking
whether you need JSON or HMP syntax first - in this case you
can see it needs HMP).

Regards,
Daniel
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: FIXED: Re: [Qemu-devel] possible regression in qemu-kvm 0.13.0 (memtest)

2011-01-06 Thread Stefan Hajnoczi

On Wed, Jan 5, 2011 at 5:01 PM, Serge E. Hallyn se...@hallyn.com wrote:
 I don't see this patch in the git tree, nor a revert of the buggy
 commit.  Was any decision made on this?

Blue Swirl posted a patch a few days ago:
[PATCH] pc: move port 92 stuff back to pc.c from pckbd.c

It hasn't been merged yet but I don't see any objections to it on the
email thread.  Perhaps he's just busy.

Stefan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR

2011-01-06 Thread Zachary Amsden


On 01/06/2011 12:34 AM, Alexander Graf wrote:

Am 06.01.2011 um 11:10 schrieb Zachary Amsdenzams...@redhat.com:

   

Use an MSR to allow soft migration to hosts which do not support
TSC trapping.  Rather than make this a required element of any
migration protocol, we allow the TSC rate to be exported as a data
field (useful in its own right), but we also allow a one time write
of the MSR during VM creation.  The result is that for the common
use case, no protocol change is required to communicate TSC rate
to the receiving host.

This allows administrative tools to configure migration policy
as they see appropriate.  Rather than dictate this policy with the
KVM implementation, we properly allow migration to hosts which both
do and do not support setting of the TSC rate on the receiving end.
If it is wished to not support migration to a host which lacks
support for the TSC rate feature, that can be coordinated externally.
 

Isn't there a real hw equivalent of such a register? It might make more sense 
to just implement that then.

   


Unfortunately, no.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX

2011-01-06 Thread Zachary Amsden


On 01/06/2011 12:41 AM, Alexander Graf wrote:

Am 06.01.2011 um 11:10 schrieb Zachary Amsdenzams...@redhat.com:

   

Reasons to trap the TSC are numerous, but we want to avoid it as much
as possible for performance reasons.

We provide two conservative modes via modules parameters and userspace
hinting.  First, the module can be loaded with tsc_auto=1 as a module
parameter, which turns on conservative TSC trapping only when it is
required (when unstable TSC or faster KHZ CPU is detected).

For userspace hinting, we enable trapping only if necessary.  Userspace
can hint that a VM needs a fixed frequency TSC, and also that SMP
stability will be required.  In that case, we conservatively turn on
trapping when it is needed.  In addition, users may now specify the
desired TSC rate at which to run.  If this rate differs significantly
from the host rate, trapping will be enabled.

There is also an override control to allow TSC trapping to be turned on
or off unconditionally for testing.

We indicate to pvclock users that the TSC is being trapped, to allow
avoiding overhead and directly using RDTSCP (only for SVM).  This
optimization is not yet implemented.
 

When migrating, the implementation could switch from non-trapped to trapped, 
making it less attractive. The guest however does not get notified about this 
change. Same for the other way around.
   


That's a policy decision to be made by the userspace agent.  It's better 
than the current situation, where there is no control at all of TSC 
rate.  Here, we're flexible either way.


Also note, moving to a faster processor, trapping kicks in... but the 
processor is faster, so no actual loss is noticed, and the problem 
corrects when the VM is power cycled.



Would it make sense to add a kvmclock interrupt to notify the guest of such a 
change?


kvmclock is immune to frequency changes, so it needs no interrupt, it 
just has a version controlled shared area, which is reset.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 12:27:55PM +0200, Michael S. Tsirkin wrote:
 On Thu, Jan 06, 2011 at 06:33:12PM +0900, Simon Horman wrote:
  Hi,
  
  Back in October I reported that I noticed a problem whereby flow control
  breaks down when openvswitch is configured to mirror a port[1].
 
 Apropos the UDP flow control.  See this
 http://www.spinics.net/lists/netdev/msg150806.html
 for some problems it introduces.
 Unfortunately UDP does not have built-in flow control.
 At some level it's just conceptually broken:
 it's not present in physical networks so why should
 we try and emulate it in a virtual network?
 
 
 Specifically, when you do:
 # netperf -c -4 -t UDP_STREAM -H 172.17.60.218 -l 30 -- -m 1472
 You are asking: what happens if I push data faster than it can be received?
 But why is this an interesting question?
 Ask 'what is the maximum rate at which I can send data with %X packet
 loss' or 'what is the packet loss at rate Y Gb/s'. netperf has
 -b and -w flags for this. It needs to be configured
 with --enable-intervals=yes for them to work.
 
 If you pose the questions this way the problem of pacing
 the execution just goes away.

I am aware that UDP inherently lacks flow control.

The aspect of flow control that I am interested in is situations where the
guest can create large amounts of work for the host. However, it seems that
in the case of virtio with vhostnet that the CPU utilisation seems to be
almost entirely attributable to the vhost and qemu-system processes.  And
in the case of virtio without vhost net the CPU is used by the qemu-system
process. In both cases I assume that I could use a cgroup or something
similar to limit the guests.

Assuming all of that is true then from a resource control problem point of
view, which is mostly what I am concerned about, the problem goes away.
However, I still think that it would be nice to resolve the situation I
described.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX


On 01/06/2011 12:10 PM, Zachary Amsden wrote:

Reasons to trap the TSC are numerous, but we want to avoid it as much
as possible for performance reasons.

We provide two conservative modes via modules parameters and userspace
hinting.  First, the module can be loaded with tsc_auto=1 as a module
parameter, which turns on conservative TSC trapping only when it is
required (when unstable TSC or faster KHZ CPU is detected).

For userspace hinting, we enable trapping only if necessary.  Userspace
can hint that a VM needs a fixed frequency TSC, and also that SMP
stability will be required.  In that case, we conservatively turn on
trapping when it is needed.  In addition, users may now specify the
desired TSC rate at which to run.  If this rate differs significantly
from the host rate, trapping will be enabled.

There is also an override control to allow TSC trapping to be turned on
or off unconditionally for testing.

We indicate to pvclock users that the TSC is being trapped, to allow
avoiding overhead and directly using RDTSCP (only for SVM).  This
optimization is not yet implemented.

Signed-off-by: Zachary Amsdenzams...@redhat.com
---
  arch/x86/include/asm/kvm_host.h|6 +-
  arch/x86/include/asm/pvclock-abi.h |1 +
  arch/x86/kvm/svm.c |   20 ++
  arch/x86/kvm/vmx.c |   21 +++
  arch/x86/kvm/x86.c |  113 +---
  arch/x86/kvm/x86.h |2 +
  include/linux/kvm.h|   15 +
  7 files changed, 168 insertions(+), 10 deletions(-)



Haven't reviewed yet, but Documentation/kvm/api.txt is missing here.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 1/2] Add TSC trapping for SVM and VMX


On 06.01.2011, at 12:30, Zachary Amsden wrote:

 On 01/06/2011 12:41 AM, Alexander Graf wrote:
 Am 06.01.2011 um 11:10 schrieb Zachary Amsdenzams...@redhat.com:
 
   
 Reasons to trap the TSC are numerous, but we want to avoid it as much
 as possible for performance reasons.
 
 We provide two conservative modes via modules parameters and userspace
 hinting.  First, the module can be loaded with tsc_auto=1 as a module
 parameter, which turns on conservative TSC trapping only when it is
 required (when unstable TSC or faster KHZ CPU is detected).
 
 For userspace hinting, we enable trapping only if necessary.  Userspace
 can hint that a VM needs a fixed frequency TSC, and also that SMP
 stability will be required.  In that case, we conservatively turn on
 trapping when it is needed.  In addition, users may now specify the
 desired TSC rate at which to run.  If this rate differs significantly
 from the host rate, trapping will be enabled.
 
 There is also an override control to allow TSC trapping to be turned on
 or off unconditionally for testing.
 
 We indicate to pvclock users that the TSC is being trapped, to allow
 avoiding overhead and directly using RDTSCP (only for SVM).  This
 optimization is not yet implemented.
 
 When migrating, the implementation could switch from non-trapped to trapped, 
 making it less attractive. The guest however does not get notified about 
 this change. Same for the other way around.
   
 
 That's a policy decision to be made by the userspace agent.  It's better than 
 the current situation, where there is no control at all of TSC rate.  Here, 
 we're flexible either way.
 
 Also note, moving to a faster processor, trapping kicks in... but the 
 processor is faster, so no actual loss is noticed, and the problem corrects 
 when the VM is power cycled.

Hrm. But even then the guest should be notified to enable it to act accordingly 
and just recalibrate instead of reboot, no? I'm not saying this is particularly 
interesting for kvmclock enabled guests, but think of all the  2.6.2x Linux, 
*BSD, Solaris, Windows etc. VMs out there that might have an easy means of 
triggering recalibration (or at least could introduce it), but writing a new 
clock source is a lot of work.

Of course, sending the notification through a userspace agent would also work. 
That one would have to be notified about the change too though.

 Would it make sense to add a kvmclock interrupt to notify the guest of such 
 a change?
 
 kvmclock is immune to frequency changes, so it needs no interrupt, it just 
 has a version controlled shared area, which is reset.


 We indicate to pvclock users that the TSC is being trapped, to allow
 avoiding overhead and directly using RDTSCP (only for SVM).  This
 optimization is not yet implemented.
 

That doesn't sound to me like they're unaffected?


Alex

 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM TSC trapping / migration 2/2] Add TSC KHZ MSR


On 06.01.2011, at 12:27, Zachary Amsden wrote:

 On 01/06/2011 12:34 AM, Alexander Graf wrote:
 Am 06.01.2011 um 11:10 schrieb Zachary Amsdenzams...@redhat.com:
 
   
 Use an MSR to allow soft migration to hosts which do not support
 TSC trapping.  Rather than make this a required element of any
 migration protocol, we allow the TSC rate to be exported as a data
 field (useful in its own right), but we also allow a one time write
 of the MSR during VM creation.  The result is that for the common
 use case, no protocol change is required to communicate TSC rate
 to the receiving host.
 
 This allows administrative tools to configure migration policy
 as they see appropriate.  Rather than dictate this policy with the
 KVM implementation, we properly allow migration to hosts which both
 do and do not support setting of the TSC rate on the receiving end.
 If it is wished to not support migration to a host which lacks
 support for the TSC rate feature, that can be coordinated externally.
 
 Isn't there a real hw equivalent of such a register? It might make more 
 sense to just implement that then.
 
   
 
 Unfortunately, no.

Bleks. I couldn't find anything in AMD documentation either. Intel 
documentation is usually hard to find and incomplete anyways, so maybe 
something's hiding there - but if it's hidden so well it's no use to implement 
either.


Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [KVM-AUTOTEST PATCH v2 3/6] [RFC] Introduce exception context strings

2011-01-06 Thread Michael Goldish

On 01/05/2011 06:21 PM, Avi Kivity wrote:
 On 01/05/2011 06:12 PM, Avi Kivity wrote:
 On 01/05/2011 05:45 PM, Michael Goldish wrote:
 In complex tests (KVM) an exception string is often not informative
 enough and
 the traceback and source code have to be examined in order to figure
 out what
 caused the exception.  Context strings are a way for tests to provide
 information about what they're doing, so that when an exception is
 raised, this
 information will be embedded in the exception string.  The result is
 a concise
 yet highly informative exception string, which should make it very
 easy to
 figure out where/when the exception was raised.

 A typical example for a test where this may be useful is KVM's reboot
 test.
 Some exceptions can be raised either before or after the VM is
 rebooted (e.g.
 logging into the guest can fail) and whether they are raised before
 or after
 is critical to the understanding of the failure.  Normally the
 traceback would
 have to be examined, but the proposed method makes it easy to know
 where the
 exception is raised without doing so.  To achieve this, the reboot
 test should
 place calls to error.context() as follows:

 error.context(before reboot)
 carry out pre-reboot actions
 error.context(sending reboot command)
 send the reboot command
 error.context(after reboot)
 carry out post-reboot actions

 If login fails in the pre-reboot section, the resulting exception
 string can
 can have something like context: before reboot embedded in it. 
 (The actual
 embedding is done in the next patch in the series.)

 It would be nice to make the error context a stack, and to use the
 with statement to manage the stack:


with error.context(main test):
foo()
with error.context(before reboot):
bar()

 If foo() throws an exception, the context would be main test, while
 if bar() throws an exception, the context would be before reboot in
 main test.

This seems like the best solution and it's unfortunate that we can't use it.

 btw, you can have a decorator for enclosing an entire function in an
 error context:
 
@function_error_context('migration test')
def migration_test(...):
...
 
 anything in migration_test() is enclosed in that context.  But we're
 just repeating the ordinary stack trace with something more readable.

The problem is that the string passed to function_error_context can't be
based on function parameters, so declaring a context like 'migrating
vm1' is impossible.

I do think we can benefit from 2 context levels per function though:

@context_aware
def migrate(...):
    base_context("migrating %s" % vm.name)
    context("collecting parameters")
    ...
    context("sending monitor command")
    ...
    context("cleanup")
    ...

base_context() and context() will just be joined together using ' -- '
like regular contexts.  base_context() can be useful for long utility
functions.  Does this sound like a reasonable solution, or do you think
it's cleaner to always define a new nested function for each context level?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 08:30:52PM +0900, Simon Horman wrote:
 On Thu, Jan 06, 2011 at 12:27:55PM +0200, Michael S. Tsirkin wrote:
  On Thu, Jan 06, 2011 at 06:33:12PM +0900, Simon Horman wrote:
   Hi,
   
   Back in October I reported that I noticed a problem whereby flow control
   breaks down when openvswitch is configured to mirror a port[1].
  
  Apropos the UDP flow control.  See this
  http://www.spinics.net/lists/netdev/msg150806.html
  for some problems it introduces.
  Unfortunately UDP does not have built-in flow control.
  At some level it's just conceptually broken:
  it's not present in physical networks so why should
  we try and emulate it in a virtual network?
  
  
  Specifically, when you do:
  # netperf -c -4 -t UDP_STREAM -H 172.17.60.218 -l 30 -- -m 1472
  You are asking: what happens if I push data faster than it can be received?
  But why is this an interesting question?
  Ask 'what is the maximum rate at which I can send data with %X packet
  loss' or 'what is the packet loss at rate Y Gb/s'. netperf has
  -b and -w flags for this. It needs to be configured
  with --enable-intervals=yes for them to work.
  
  If you pose the questions this way the problem of pacing
  the execution just goes away.
 
 I am aware that UDP inherently lacks flow control.

Everyone is aware of that, but this is always followed by a 'however'
:).

 The aspect of flow control that I am interested in is situations where the
 guest can create large amounts of work for the host. However, it seems that
 in the case of virtio with vhostnet that the CPU utilisation seems to be
 almost entirely attributable to the vhost and qemu-system processes.  And
 in the case of virtio without vhost net the CPU is used by the qemu-system
 process. In both case I assume that I could use a cgroup or something
 similar to limit the guests.

cgroups, yes. the vhost process inherits the cgroups
from the qemu process so you can limit them all.

If you are after limiting the max throughput of the guest
you can do this with cgroups as well.

 Assuming all of that is true then from a resource control problem point of
 view, which is mostly what I am concerned about, the problem goes away.
 However, I still think that it would be nice to resolve the situation I
 described.

We need to articulate what's wrong here, otherwise we won't
be able to resolve the situation. We are sending UDP packets
as fast as we can and some receivers can't cope. Is this the problem?
We have made attempts to add a pseudo flow control in the past
in an attempt to make UDP on the same host work better.
Maybe they help some but they also sure introduce problems.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

OK, got test environment running, but it seems to be running much faster
there :(
but as dan suggested, I can type monitor commands using virsh, so I can
(carefully:)) continue debugging on this production machine..
here's info registers:
RAX=0007 RBX=00ac RCX=f880009d1015 
RDX=03ce
RSI=018a RDI=f8000163f737 RBP=0007 
RSP=f88002588b08
R8 =000f R9 =00ac R10=7b20 
R11=0008
R12=00a8 R13= R14=0012c690 
R15=001d52d0
RIP=f8000156ae48 RFL=0202 [---] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =002b   00c0f300 DPL=3 DS   [-WA]
CS =0010   00209b00 DPL=0 CS64 [-RA]
SS =0018   00c09300 DPL=0 DS   [-WA]
DS =002b   00c0f300 DPL=3 DS   [-WA]
FS =0053 fffe 3c00 0040f300 DPL=3 DS   [-WA]
GS =002b f80001644d00  00c0f300 DPL=3 DS   [-WA]
LDT=   
TR =0040 f80002767080 0067 8b00 DPL=0 TSS64-busy
GDT= f80002766000 007f
IDT= f80002766080 0fff
CR0=80050031 CR2=0047029a CR3=94925000 CR4=06f8
DR0= DR1= DR2= 
DR3=
DR6=0ff0 DR7=0400
EFER=0d01
FCW=027f FSW= [ST=0] FTW=00 MXCSR=1f80
FPR0=  FPR1= 
FPR2=  FPR3= 
FPR4=  FPR5= 
FPR6=  FPR7= 
XMM00=00288c3000a000a0 XMM01=
XMM02= XMM03=
XMM04= XMM05=
XMM06= XMM07=
XMM08= XMM09=
XMM10= XMM11=
XMM12= XMM13=
XMM14= XMM15=

 Looks like vcpu 1 is spinning; perhaps that's normal.  If you get hold  
 of the monitor, please disassemble around 0xf80001575d59.
ouch, can you advise me on how to do it? :-[


 vcpu 0 is busy writing to vga (can you confirm)? looks like bank  
yes, seems like screen refreshing is quite slow, certainly in this rescue mode
or what it is, it's not using any acceleration...

 switching is hitting synchronize_srcu_expedited(), which is known slow.   
 Unfortunately that only gets better in 2.6.38.

 You can try applying  
 http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git;a=commit;h=46fdb0937f26124700fc9fc80da4776330cc00d3
  
I'll be able to test this only on testing machine, or on this production maybe 
overnight..
I'll prepare the kernel anyways..


 and see if it helps.

 -- 
 error compiling committee.c: too many arguments to function

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

 If you have a new enough libvirt, then you can also send
 commands directly using 'virsh qemu-monitor-command' (checking
 whether you need JSON or HMP syntax first - in this case you
 can see it needs HMP).
Thanks Dan!
didn't know this is possible, works pretty well!
n.

 
 Regards,
 Daniel
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 

-- 
-
Ing. Nikola CIPRICH
LinuxBox.cz, s.r.o.
28. rijna 168, 709 01 Ostrava

tel.:   +420 596 603 142
fax:+420 596 621 273
mobil:  +420 777 093 799
www.linuxbox.cz

mobil servis: +420 737 238 656
email servis: ser...@linuxbox.cz
-
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 02:07:22PM +0200, Michael S. Tsirkin wrote:
 On Thu, Jan 06, 2011 at 08:30:52PM +0900, Simon Horman wrote:
  On Thu, Jan 06, 2011 at 12:27:55PM +0200, Michael S. Tsirkin wrote:
   On Thu, Jan 06, 2011 at 06:33:12PM +0900, Simon Horman wrote:
Hi,

Back in October I reported that I noticed a problem whereby flow control
breaks down when openvswitch is configured to mirror a port[1].
   
   Apropos the UDP flow control.  See this
   http://www.spinics.net/lists/netdev/msg150806.html
   for some problems it introduces.
   Unfortunately UDP does not have built-in flow control.
   At some level it's just conceptually broken:
   it's not present in physical networks so why should
   we try and emulate it in a virtual network?
   
   
   Specifically, when you do:
   # netperf -c -4 -t UDP_STREAM -H 172.17.60.218 -l 30 -- -m 1472
   You are asking: what happens if I push data faster than it can be 
   received?
   But why is this an interesting question?
   Ask 'what is the maximum rate at which I can send data with %X packet
   loss' or 'what is the packet loss at rate Y Gb/s'. netperf has
   -b and -w flags for this. It needs to be configured
   with --enable-intervals=yes for them to work.
   
   If you pose the questions this way the problem of pacing
   the execution just goes away.
  
  I am aware that UDP inherently lacks flow control.
 
 Everyone's is aware of that, but this is always followed by a 'however'
 :).
 
  The aspect of flow control that I am interested in is situations where the
  guest can create large amounts of work for the host. However, it seems that
  in the case of virtio with vhostnet that the CPU utilisation seems to be
  almost entirely attributable to the vhost and qemu-system processes.  And
  in the case of virtio without vhost net the CPU is used by the qemu-system
  process. In both case I assume that I could use a cgroup or something
  similar to limit the guests.
 
 cgroups, yes. the vhost process inherits the cgroups
 from the qemu process so you can limit them all.
 
 If you are after limiting the max troughput of the guest
 you can do this with cgroups as well.

Do you mean a CPU cgroup or something else?

  Assuming all of that is true then from a resource control problem point of
  view, which is mostly what I am concerned about, the problem goes away.
  However, I still think that it would be nice to resolve the situation I
  described.
 
 We need to articulate what's wrong here, otherwise we won't
 be able to resolve the situation. We are sending UDP packets
 as fast as we can and some receivers can't cope. Is this the problem?
 We have made attempts to add a pseudo flow control in the past
 in an attempt to make UDP on the same host work better.
 Maybe they help some but they also sure introduce problems.

In the case where port mirroring is not active, which is the
usual case, to some extent there is flow control in place due to
(as Eric Dumazet pointed out) the socket buffer.

When port mirroring is activated the flow control operates based
only on one port - which can't be controlled by the administrator
in an obvious way.

I think that it would be more intuitive if flow control was
based on sending a packet to all ports rather than just one.

Though now I think about it some more, perhaps this isn't the best either.
For instance the case where data was being sent to dummy0 and suddenly
adding a mirror on eth1 slowed everything down.

So perhaps there needs to be another knob to tune when setting
up port-mirroring. Or perhaps the current situation isn't so bad.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 11:22:42AM +0100, Eric Dumazet wrote:
 Le jeudi 06 janvier 2011 à 18:33 +0900, Simon Horman a écrit :
  Hi,
  
  Back in October I reported that I noticed a problem whereby flow control
  breaks down when openvswitch is configured to mirror a port[1].
  
  I have (finally) looked into this further and the problem appears to relate
  to cloning of skbs, as Jesse Gross originally suspected.
  
  More specifically, in do_execute_actions[2] the first n-1 times that an skb
  needs to be transmitted it is cloned first and the final time the original
  skb is used.
  
  In the case that there is only one action, which is the normal case, then
  the original skb will be used. But in the case of mirroring the cloning
  comes into effect. And in my case the cloned skb seems to go to the (slow)
  eth1 interface while the original skb goes to the (fast) dummy0 interface
  that I set up to be a mirror. The result is that dummy0 paces the flow,
  and its a cracking pace at that.
  
  As an experiment I hacked do_execute_actions() to use the original skb
  for the first action instead of the last one.  In my case the result was
  that eth1 paces the flow, and things work reasonably nicely.
  
  Well, sort of. Things work well for non-GSO skbs but extremely poorly for
  GSO skbs where only 3 (yes 3, not 3%) end up at the remote host running
  netserv. I'm unsure why, but I digress.
  
  It seems to me that my hack illustrates the point that the flow ends up
  being paced by one interface. However I think that what would be
  desirable is that the flow is paced by the slowest link. Unfortunately
  I'm unsure how to achieve that.
  
 
 Hi Simon !
 
 pacing is done because skb is attached to a socket, and a socket has a
 limited (but configurable) sndbuf. sk->sk_wmem_alloc is the current sum
 of all truesize skbs in flight.
 
 When you enter something that :
 
 1) Get a clone of the skb, queue the clone to device X
 2) queue the original skb to device Y
 
 Then :Socket sndbuf is not affected at all by device X queue.
   This is speed on device Y that matters.
 
 You want to get servo control on both X and Y
 
 You could try to
 
 1) Get a clone of skb
Attach it to socket too (so that socket get a feedback of final
 orphaning for the clone) with skb_set_owner_w()
queue the clone to device X
 
 Unfortunately, stacked skb->destructor() makes this possible only for
 known destructor (aka sock_wfree())

Hi Eric !

Thanks for the advice. I had thought about the socket buffer but at some
point it slipped my mind.

In any case the following patch seems to implement the change that I had in
mind. However my discussions with Michael Tsirkin elsewhere in this thread are
beginning to make me think that perhaps this change isn't the
best solution.

diff --git a/datapath/actions.c b/datapath/actions.c
index 5e16143..505f13f 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -384,7 +384,12 @@ static int do_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
 
for (a = actions, rem = actions_len; rem  0; a = nla_next(a, rem)) {
if (prev_port != -1) {
-   do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+   if (nskb) {
+   if (skb-sk)
+   skb_set_owner_w(nskb, skb-sk);
+   do_output(dp, nskb, prev_port);
+   }
prev_port = -1;
}

I got a rather nasty panic without the if (skb->sk),
I guess some skbs don't have a socket.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

On Thu, Jan 06, 2011 at 09:29:02PM +0900, Simon Horman wrote:
 On Thu, Jan 06, 2011 at 02:07:22PM +0200, Michael S. Tsirkin wrote:
  On Thu, Jan 06, 2011 at 08:30:52PM +0900, Simon Horman wrote:
   On Thu, Jan 06, 2011 at 12:27:55PM +0200, Michael S. Tsirkin wrote:
On Thu, Jan 06, 2011 at 06:33:12PM +0900, Simon Horman wrote:
 Hi,
 
 Back in October I reported that I noticed a problem whereby flow 
 control
 breaks down when openvswitch is configured to mirror a port[1].

Apropos the UDP flow control.  See this
http://www.spinics.net/lists/netdev/msg150806.html
for some problems it introduces.
Unfortunately UDP does not have built-in flow control.
At some level it's just conceptually broken:
it's not present in physical networks so why should
we try and emulate it in a virtual network?


Specifically, when you do:
# netperf -c -4 -t UDP_STREAM -H 172.17.60.218 -l 30 -- -m 1472
You are asking: what happens if I push data faster than it can be 
received?
But why is this an interesting question?
Ask 'what is the maximum rate at which I can send data with %X packet
loss' or 'what is the packet loss at rate Y Gb/s'. netperf has
-b and -w flags for this. It needs to be configured
with --enable-intervals=yes for them to work.

If you pose the questions this way the problem of pacing
the execution just goes away.
   
   I am aware that UDP inherently lacks flow control.
  
  Everyone is aware of that, but this is always followed by a 'however'
  :).
  
   The aspect of flow control that I am interested in is situations where the
   guest can create large amounts of work for the host. However, it seems 
   that
   in the case of virtio with vhostnet that the CPU utilisation seems to be
   almost entirely attributable to the vhost and qemu-system processes.  And
   in the case of virtio without vhost net the CPU is used by the qemu-system
   process. In both case I assume that I could use a cgroup or something
   similar to limit the guests.
  
  cgroups, yes. the vhost process inherits the cgroups
  from the qemu process so you can limit them all.
  
  If you are after limiting the max throughput of the guest
  you can do this with cgroups as well.
 
 Do you mean a CPU cgroup or something else?

net classifier cgroup

   Assuming all of that is true then from a resource control problem point of
   view, which is mostly what I am concerned about, the problem goes away.
   However, I still think that it would be nice to resolve the situation I
   described.
  
  We need to articulate what's wrong here, otherwise we won't
  be able to resolve the situation. We are sending UDP packets
  as fast as we can and some receivers can't cope. Is this the problem?
  We have made attempts to add a pseudo flow control in the past
  in an attempt to make UDP on the same host work better.
  Maybe they help some but they also sure introduce problems.
 
 In the case where port mirroring is not active, which is the
 usual case, to some extent there is flow control in place due to
 (as Eric Dumazet pointed out) the socket buffer.
 
 When port mirroring is activated the flow control operates based
 only on one port - which can't be controlled by the administrator
 in an obvious way.
 
 I think that it would be more intuitive if flow control was
 based on sending a packet to all ports rather than just one.
 
 Though now I think about it some more, perhaps this isn't the best either.
 For instance the case where data was being sent to dummy0 and suddenly
 adding a mirror on eth1 slowed everything down.
 
 So perhaps there needs to be another knob to tune when setting
 up port-mirroring. Or perhaps the current situation isn't so bad.

To understand whether it's bad, you'd need to measure it.
The netperf manual says:
5.2.4 UDP_STREAM

A UDP_STREAM test is similar to a TCP_STREAM test except UDP is 
used as
the transport rather than TCP.

A UDP_STREAM test has no end-to-end flow control - UDP provides 
none
and neither does netperf. However, if you wish, you can configure 
netperf with
--enable-intervals=yes to enable the global command-line -b and -w 
options to
pace bursts of traffic onto the network.

This has a number of implications.

...
and one of the implications is that the max throughput
might not be reached when you try to send as much data as possible.
It might be confusing that this is what netperf does by default with UDP_STREAM:
if the endpoint is much faster than the network the issue might not appear.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

cgroup limits only affect kvm guest under certain conditions

2011-01-06 Thread Dominik Klein

Hi

I am playing with cgroups and try to limit block io for guests.

The proof of concept is:

# mkdir /dev/cgroup/blkio
# mount -t cgroup -o blkio blkio /dev/cgroup/blkio/
# cd blkio/
# mkdir test
# cd test/
# ls -l /dev/vdisks/kirk
lrwxrwxrwx 1 root root 7 2011-01-06 13:46 /dev/vdisks/kirk -> ../dm-5
# ls -l /dev/dm-5
brw-rw---- 1 root disk 253, 5 2011-01-06 13:36 /dev/dm-5
# echo "253:5  1048576" > blkio.throttle.write_bps_device
# echo $$ > tasks
# dd if=/dev/zero of=/dev/dm-5 bs=1M count=20
20+0 records in
20+0 records out
20971520 bytes (21 MB) copied, 20.0223 s, 1.0 MB/s

So limit applies to the dd child of my shell.

Now I assign /dev/dm-5 (/dev/vdisks/kirk) to a vm and echo the qemu-kvm
pid into tasks. Limits are not applied, the guest can happily use max io
bandwidth.

However, if I start the guest manually like

# qemu-kvm <options like libvirt creates> & echo $! >
/dev/cgroup/blkio/test/tasks

The limits _are_ applied.

So, this looks like some sort of race condition to me.

I tried to get information on this on the kernel mailing list [1], but
either no one read it or at least no one replied. Maybe someone here can
shed some light and maybe even fix the issue, if it is an issue.

Need more information? Please ask for it. I don't know what else to
supply at this point.

My current lab is built by:
OpenSuSE 11.3 64bit
Vanilla Kernel 2.6.37
libvirt 0.8.7
qemu-kvm 0.13.0
vm is started like this:
/usr/bin/qemu-kvm -M pc-0.12 -enable-kvm -m 2048 -smp
2,sockets=2,cores=1,threads=1 -name cliff -uuid
a8247e1e-e3d2-d0fc-c5e5-47a173c3e460 -nodefconfig -nodefaults -chardev
socket,id=monitor,path=/var/lib/libvirt/qemu/cliff.monitor,server,nowait
-mon chardev=monitor,mode=readline -rtc base=utc -boot c -device
lsi,id=scsi0,bus=pci.0,addr=0x6 -drive
file=/dev/vdisks/cliff,if=none,id=drive-virtio-disk0,boot=on,format=raw
-device
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0
-drive
file=/root/openSUSE-11.3-NET-x86_64.iso,if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw
-device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0
-drive file=/dev/vdisks/jason,if=none,id=drive-virtio-disk1,format=raw
-device
virtio-blk-pci,bus=pci.0,addr=0x7,drive=drive-virtio-disk1,id=virtio-disk1
-drive file=/dev/vdisks/rob,if=none,id=drive-ide0-0-0,format=raw -device
ide-drive,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0 -drive
file=/dev/vdisks/james,if=none,id=drive-scsi0-0-0,format=raw -device
scsi-disk,bus=scsi0.0,scsi-id=0,drive=drive-scsi0-0-0,id=scsi0-0-0
-netdev tap,id=hostnet0 -device
virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:37:84:e0,bus=pci.0,addr=0x5
-usb -vnc 127.0.0.1:0 -vga cirrus -device
virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x3

Regards
Dominik

[1]
http://help.lockergnome.com/linux/race-condition-net_cls-found-qemu-kvm-environment--ftopict529787.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow


On 01/06/2011 02:18 PM, Nikola Ciprich wrote:

OK, got test environment running, but it seems to be running much faster
there :(


Same host kernel?


but as dan suggested, I can type monitor commands using virsh, so I can
(carefully:)) continue debugging on this production machine..
here's info registers:
RAX=0007 RBX=00ac RCX=f880009d1015 
RDX=03ce
RSI=018a RDI=f8000163f737 RBP=0007 
RSP=f88002588b08
R8 =000f R9 =00ac R10=7b20 
R11=0008
R12=00a8 R13= R14=0012c690 
R15=001d52d0
RIP=f8000156ae48 RFL=0202 [---] CPL=0 II=0 A20=1 SMM=0 HLT=0



That's cpu 0, which is busy writing stuff to the screen, not very 
interesting.



  Looks like vcpu 1 is spinning; perhaps that's normal.  If you get hold
  of the monitor, please disassemble around 0xf80001575d59.
ouch, can You advise me on how to do it? :-[


(qemu) cpu 1
(qemu) info registers
(qemu) x/100i 0xf80001575d59 - 35



  vcpu 0 is busy writing to vga (can you confirm)? looks like bank
yes, seems like screen refreshing is quite slow, certainly in this rescue mode
or what it is, it's not using any acceleration...


It's actually decelerated by synchronize_srcu_expedited().  It's one 
area which got a large slowdown as the price for the great scalability 
we achieved with srcu.


But wait, this doesn't make sense.  If we see mmio to vga, then bank 
switching is not involved, yet I see huge latencies on writes to vga io 
ports.


Please install the qemu debuginfo package (if you built it yourself, I 
hope it was with debug symbols enabled) and run 'perf top'.



  switching is hitting synchronize_srcu_expedited(), which is known slow.
  Unfortunately that only gets better in 2.6.38.

  You can try applying
  
http://git.kernel.org/?p=linux/kernel/git/sfr/linux-next.git;a=commit;h=46fdb0937f26124700fc9fc80da4776330cc00d3
I'll be able to test this only on testing machine, or on this production maybe 
overnight..
I'll prepare the kernel anyways..



I'm no longer sure this is the problem.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Flow Control and Port Mirroring Revisited

2011-01-06 Thread Eric Dumazet

Le jeudi 06 janvier 2011 à 21:44 +0900, Simon Horman a écrit :

 Hi Eric !
 
 Thanks for the advice. I had thought about the socket buffer but at some
 point it slipped my mind.
 
 In any case the following patch seems to implement the change that I had in
 mind. However my discussions Michael Tsirkin elsewhere in this thread are
 beginning to make me think that think that perhaps this change isn't the
 best solution.
 
 diff --git a/datapath/actions.c b/datapath/actions.c
 index 5e16143..505f13f 100644
 --- a/datapath/actions.c
 +++ b/datapath/actions.c
 @@ -384,7 +384,12 @@ static int do_execute_actions(struct datapath *dp, 
 struct sk_buff *skb,
  
   for (a = actions, rem = actions_len; rem  0; a = nla_next(a, rem)) {
   if (prev_port != -1) {
 - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port);
 + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 + if (nskb) {
 + if (skb-sk)
 + skb_set_owner_w(nskb, skb-sk);
 + do_output(dp, nskb, prev_port);
 + }
   prev_port = -1;
   }
 
 I got a rather nasty panic without the if (skb-sk),
 I guess some skbs don't have a socket.

Indeed, some packets are not linked to a socket.

(ARP packets for example)

Sorry, I should have mentioned it :)


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Fwd: Re: [RFC -v3 PATCH 2/3] sched: add yield_to function]

2011-01-06 Thread Hillf Danton

On Wed, Jan 5, 2011 at 5:41 PM, Peter Zijlstra pet...@infradead.org wrote:
 On Wed, 2011-01-05 at 00:38 +0100, Tommaso Cucinotta wrote:
 Il 04/01/2011 19:15, Dario Faggioli ha scritto:

   Forwarded Message 
  From: Peter Zijlstraa.p.zijls...@chello.nl
  To: Rik van Rielr...@redhat.com
  Cc: Hillf Dantondhi...@gmail.com,kvm@vger.kernel.org,
  linux-ker...@vger.kernel.org, Avi Kivitia...@redhat.com, Srivatsa
  Vaddagiriva...@linux.vnet.ibm.com, Mike Galbraithefa...@gmx.de,
  Chris Wrightchr...@sous-sol.org
  Subject: Re: [RFC -v3 PATCH 2/3] sched: add yield_to function
  Date: Tue, 04 Jan 2011 19:05:54 +0100
  RT guests don't make sense, there's nowhere near enough infrastructure
  for that to work well.

  I'd argue that KVM running with RT priority is a bug.
 Peter, can I ask why did you state that ? In the IRMOS project, we
 are just deploying KVM VMs by using the Fabio's real-time scheduler
 (for others, a.k.a., the Fabio's EDF throttling patch, or IRMOS RT
 scheduler)
 in order to let the VMs get precise CPU scheduling guarantees by the
 kernel. So, in this context we do have KVM running at RT priority, and
 we do have experimental results showing how this can improve stability
 of performance of the hosted guest VMs.
 Of course, don't misunderstand me: this is a necessary condition for a
 stable performance of KVM VMs, I'm not saying it is sufficient for

 I was mostly referring to the existing RT cruft (SCHED_RR/FIFO), that's
 utterly useless for KVM.

 As to hosting vcpus with CBS this might maybe make sense, but RT-guests
 are still miles away. Anyway, I'm not quite sure how you would want to
 deal with the guest spinlock issue in CBS, ideally you'd use paravirt
 guests to avoid that whole problem.

 Anyway, /me goes do something useful, virt sucks and should be taken out
 back and shot in the head.

I don't think we are still on the track of the patch from Rik, in
which Mike brought the yield_to method into scheduling.

The focus, as I see, is mainly on the effectiveness of the new method,
since it could also be utilized in other environments, though
currently it has nothing to do with the RT cruft but aims at easing
certain lock contention in KVM.

Another issue is that the change in the fair scheduling class,
accompanying the new method, is deserved, for any reason Rik hold.

Let's please return to the patch, and defer the RT.

thanks
Hillf
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [libvirt] cgroup limits only affect kvm guest under certain conditions

2011-01-06 Thread Daniel P. Berrange

On Thu, Jan 06, 2011 at 02:15:37PM +0100, Dominik Klein wrote:
 Hi
 
 I am playing with cgroups and try to limit block io for guests.
 
 The proof of concept is:
 
 # mkdir /dev/cgroup/blkio
 # mount -t cgroup -o blkio blkio /dev/cgroup/blkio/
 # cd blkio/
 # mkdir test
 # cd test/
 # ls -l /dev/vdisks/kirk
 lrwxrwxrwx 1 root root 7 2011-01-06 13:46 /dev/vdisks/kirk - ../dm-5
 # ls -l /dev/dm-5
 brw-rw 1 root disk 253, 5 2011-01-06 13:36 /dev/dm-5
 # echo 253:5  1048576  blkio.throttle.write_bps_device
 # echo $$  tasks
 # dd if=/dev/zero of=/dev/dm-5 bs=1M count=20
 20+0 records in
 20+0 records out
 20971520 bytes (21 MB) copied, 20.0223 s, 1.0 MB/s
 
 So limit applies to the dd child of my shell.
 
 Now I assign /dev/dm-5 (/dev/vdisks/kirk) to a vm and echo the qemu-kvm
 pid into tasks. Limits are not applied, the guest can happily use max io
 bandwidth.

Did you just echo the main qemu-kvm PID, or did you also
add the PIDs of every thread too ? From this description
of the problem, I'd guess you've only confined the main
process thread and thus the I/O & VCPU threads are not
confined.

Daniel
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] pci-assign: Fix transition MSI-INTx


On 01/04/2011 11:04 PM, Jan Kiszka wrote:

From: Jan Kiszkajan.kis...@siemens.com

Make sure to re-register the IRQ of an assigned device as INTx when the
guest disables MSI[X] mode again.

Signed-off-by: Jan Kiszkajan.kis...@siemens.com


Cc: More Reviewers


---
  hw/device-assignment.c |6 ++
  1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index f6410f8..fce7567 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -1192,7 +1192,10 @@ static void assigned_dev_update_msi(PCIDevice *pci_dev, 
unsigned int ctrl_pos)
  if (kvm_assign_irq(kvm_context,assigned_irq_data)  0)
  perror(assigned_dev_enable_msi: assign irq);

+assigned_dev-girq = -1;
  assigned_dev-irq_requested_type = assigned_irq_data.flags;
+} else {
+assign_irq(assigned_dev);
  }
  }
  #endif
@@ -1332,7 +1335,10 @@ static void assigned_dev_update_msix(PCIDevice *pci_dev, 
unsigned int ctrl_pos)
  perror(assigned_dev_enable_msix: assign irq);
  return;
  }
+assigned_dev-girq = -1;
  assigned_dev-irq_requested_type = assigned_irq_data.flags;
+} else {
+assign_irq(assigned_dev);
  }
  }
  #endif



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [libvirt] cgroup limits only affect kvm guest under certain conditions

2011-01-06 Thread Dominik Klein

 Did you just echo the main qemu-kvm PID, or did you also
 add the PIDs of every thread too ? From this description
 of the problem, I'd guess you've only confined the main
 process thread and thus the I/O  VCPU threads are not
 confined.

That was indeed correct. I was misled by the fact that no child
_processes_ were shown in ps.

Once I added /proc/<qemu-pid>/task/* to tasks, it worked even when
libvirt started the process.

Thank you very much!
Dominik
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] pci-assign: Fix transition MSI-INTx

2011-01-06 Thread Alex Williamson

On Thu, 2011-01-06 at 15:40 +0200, Avi Kivity wrote:
 On 01/04/2011 11:04 PM, Jan Kiszka wrote:
  From: Jan Kiszkajan.kis...@siemens.com
 
  Make sure to re-register the IRQ of an assigned device as INTx when the
  guest disables MSI[X] mode again.
 
  Signed-off-by: Jan Kiszkajan.kis...@siemens.com
 
 Cc: More Reviewers

Looks fine to me.

Acked-by: Alex Williamson alex.william...@redhat.com

  ---
hw/device-assignment.c |6 ++
1 files changed, 6 insertions(+), 0 deletions(-)
 
  diff --git a/hw/device-assignment.c b/hw/device-assignment.c
  index f6410f8..fce7567 100644
  --- a/hw/device-assignment.c
  +++ b/hw/device-assignment.c
  @@ -1192,7 +1192,10 @@ static void assigned_dev_update_msi(PCIDevice 
  *pci_dev, unsigned int ctrl_pos)
if (kvm_assign_irq(kvm_context,assigned_irq_data)  0)
perror(assigned_dev_enable_msi: assign irq);
 
  +assigned_dev-girq = -1;
assigned_dev-irq_requested_type = assigned_irq_data.flags;
  +} else {
  +assign_irq(assigned_dev);
}
}
#endif
  @@ -1332,7 +1335,10 @@ static void assigned_dev_update_msix(PCIDevice 
  *pci_dev, unsigned int ctrl_pos)
perror(assigned_dev_enable_msix: assign irq);
return;
}
  +assigned_dev-girq = -1;
assigned_dev-irq_requested_type = assigned_irq_data.flags;
  +} else {
  +assign_irq(assigned_dev);
}
}
#endif
 
 



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Host's write request is 4KB larger than that of the guest?

2011-01-06 Thread Duy Le (Dan)

Hi there,

I used a raw disk image to host a VM. I later used blktrace to capture
write requests issued from the guest to commit data to a physical disk
and found that the request size of the host is 8 blocks (4KB) larger
than that of the guest. Here is a part of the trace.

--- Guest
251,32   0  100 2.3352510  D   W 18747926 + 8 [kblockd/0]
251,32   0  101 2.3629710  D   W 18752462 + 24 [kblockd/0]
251,32   0  102 2.3836710  D   W 18756566 + 8 [kblockd/0]

--- Host
  8,16   0  155 2.415036111 32357  D   W 19146095 + 16 [kvm]
  8,16   0  156 2.415042667 32357  D   W 19150639 + 32 [kvm]
  8,16   0  158 2.415063867 32349  D   W 19154743 + 16 [kvm]

The content of those additionally written sectors on the disk can either
be zero or something else. Please let me know if you have a clue or
tell me which file in the package I should be looking at more closely to
understand this behavior.

I really appreciate that.

-- Dan
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow

 Same host kernel?
yes, I also disabled KSM now, see below.

 (qemu) cpu 1
 (qemu) info registers

RAX= RBX= RCX=0002 
RDX=55a9
RSI=fa8003660450 RDI=0001 RBP=0080 
RSP=f880009f7cc0
R8 = R9 =0f44 R10=f8000145a000 
R11=
R12= R13=f800015cada0 R14= 
R15=f880009bcec0
RIP=f80001575d5b RFL=0202 [---] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =002b   00c0f300 DPL=3 DS   [-WA]
CS =0010   00209b00 DPL=0 CS64 [-RA]
SS =0018   00c09300 DPL=0 DS   [-WA]
DS =002b   00c0f300 DPL=3 DS   [-WA]
FS =0053 fffe 7c00 0040f300 DPL=3 DS   [-WA]
GS =002b f880009b8000  00c0f300 DPL=3 DS   [-WA]
LDT=   
TR =0040 f880009bcec0 0067 8b00 DPL=0 TSS64-busy
GDT= f880009c34c0 007f
IDT= f880009c3540 0fff
CR0=80050031 CR2=f8800121ca30 CR3=00187000 CR4=06f8
DR0= DR1= DR2= 
DR3= 
DR6=0ff0 DR7=0400
EFER=0d01
FCW=027f FSW=3800 [ST=7] FTW=80 MXCSR=1f80
FPR0=9fc0 4008 FPR1= 
FPR2=  FPR3= 
FPR4=  FPR5= 
FPR6=  FPR7= 
XMM00=99b6438668a3a8ed8cfd2d2540e9389a XMM01=
XMM02= XMM03=
XMM04= XMM05=
XMM06= XMM07=
XMM08= XMM09=
XMM10= XMM11=
XMM12= XMM13=
XMM14= XMM15=



 (qemu) x/100i 0xf80001575d59 - 35
virsh # qemu-monitor-command 2 x/100i 0xf80001575d59 - 35
0xf80001575d36:  mov%ebp,0x18(%rsp)
0xf80001575d3a:  push   %rsi
0xf80001575d3b:  push   %rdi
0xf80001575d3c:  push   %r12
0xf80001575d3e:  sub$0x30,%rsp
0xf80001575d42:  mov%cr8,%rbx
0xf80001575d46:  mov$0x2,%ecx
0xf80001575d4b:  cmp%cl,%bl
0xf80001575d4d:  ja 0xf80001575d67
0xf80001575d4f:  cmpq   $0x0,0x1598d1(%rip)# 0xf800016cf628
0xf80001575d57:  je 0xf80001575d5d
0xf80001575d59:  pause  
0xf80001575d5b:  jmp0xf80001575d4f
0xf80001575d5d:  mov%cr8,%rax
0xf80001575d61:  mov%rcx,%cr8
0xf80001575d65:  mov%al,%bl
0xf80001575d67:  mov%gs:0x20,%rdi
0xf80001575d70:  xor%r9d,%r9d
0xf80001575d73:  btl$0x10,0xe38c9(%rip)# 0xf80001659644
0xf80001575d7b:  jae0xf80001575d95
0xf80001575d7d:  mov$0x1,%sil
0xf80001575d80:  rdtsc  
0xf80001575d82:  mov0x4700(%rdi),%r12d
0xf80001575d89:  shl$0x20,%rdx
0xf80001575d8d:  or %rdx,%rax
0xf80001575d90:  mov%rax,%rbp
0xf80001575d93:  jmp0xf80001575da2
0xf80001575d95:  mov0x50(%rsp),%rbp
0xf80001575d9a:  mov0x50(%rsp),%r12d
0xf80001575d9f:  xor%sil,%sil
0xf80001575da2:  incl   0x4b00(%rdi)
0xf80001575da8:  lock btsq $0x0,0x159876(%rip)# 0xf800016cf628
0xf80001575db2:  jae0xf80001575dd1
0xf80001575db4:  lea0x15986d(%rip),%rcx# 0xf800016cf628
0xf80001575dbb:  callq  0xf8000148c1c0
0xf80001575dc0:  incl   0x4b04(%rdi)
0xf80001575dc6:  add%eax,0x4b08(%rdi)
0xf80001575dcc:  mov%eax,%r9d
0xf80001575dcf:  jmp0xf80001575dd4
0xf80001575dd1:  lfence 
0xf80001575dd4:  test   %sil,%sil
0xf80001575dd7:  je 0xf80001575e01
0xf80001575dd9:  rdtsc  
0xf80001575ddb:  shl$0x20,%rdx
0xf80001575ddf:  lea0x159842(%rip),%rcx# 0xf800016cf628
0xf80001575de6:  movb   $0x0,0x28(%rsp)
0xf80001575deb:  or %rdx,%rax
0xf80001575dee:  mov%r12d,0x20(%rsp)
0xf80001575df3:  mov%eax,%r8d
0xf80001575df6:  mov%rax,%rdx
0xf80001575df9:  sub%ebp,%r8d
0xf80001575dfc:  callq  0xf80001560f10
0xf80001575e01:  mov0x60(%rsp),%rbp
0xf80001575e06:  mov%bl,0x159818(%rip)# 0xf800016cf624
0xf80001575e0c:  mov0x58(%rsp),%rbx
0xf80001575e11:  add$0x30,%rsp
0xf80001575e15:  pop%r12
0xf80001575e17:  pop%rdi
0xf80001575e18:  pop%rsi
0xf80001575e19:  retq   
0xf80001575e1a:  nop
0xf80001575e1b:  nop
0xf80001575e1c:  nop
0xf80001575e1d:  nop
0xf80001575e1e:  nop

Re: [PATCH] pci-assign: Fix transition MSI-INTx

On Tue, Jan 04, 2011 at 10:04:30PM +0100, Jan Kiszka wrote:
 From: Jan Kiszka jan.kis...@siemens.com
 
 Make sure to re-register the IRQ of an assigned device as INTx when the
 guest disables MSI[X] mode again.
 
 Signed-off-by: Jan Kiszka jan.kis...@siemens.com

Acked-by: Michael S. Tsirkin m...@redhat.com

 ---
  hw/device-assignment.c |6 ++
  1 files changed, 6 insertions(+), 0 deletions(-)
 
 diff --git a/hw/device-assignment.c b/hw/device-assignment.c
 index f6410f8..fce7567 100644
 --- a/hw/device-assignment.c
 +++ b/hw/device-assignment.c
 @@ -1192,7 +1192,10 @@ static void assigned_dev_update_msi(PCIDevice 
 *pci_dev, unsigned int ctrl_pos)
  if (kvm_assign_irq(kvm_context, assigned_irq_data)  0)
  perror(assigned_dev_enable_msi: assign irq);
  
 +assigned_dev-girq = -1;
  assigned_dev-irq_requested_type = assigned_irq_data.flags;
 +} else {
 +assign_irq(assigned_dev);
  }
  }
  #endif
 @@ -1332,7 +1335,10 @@ static void assigned_dev_update_msix(PCIDevice 
 *pci_dev, unsigned int ctrl_pos)
  perror(assigned_dev_enable_msix: assign irq);
  return;
  }
 +assigned_dev-girq = -1;
  assigned_dev-irq_requested_type = assigned_irq_data.flags;
 +} else {
 +assign_irq(assigned_dev);
  }
  }
  #endif
 -- 
 1.7.1
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC -v3 PATCH 2/3] sched: add yield_to function

2011-01-06 Thread Hillf Danton

On Thu, Jan 6, 2011 at 12:57 AM, Mike Galbraith efa...@gmx.de wrote:
 sched: Add yield_to(task, preempt) functionality.

 Currently only implemented for fair class tasks.

 Add a yield_to_task() method to the fair scheduling class, allowing the
 caller of yield_to() to accelerate another thread in its thread group,
 task group, and sched class toward either its cpu, or potentially the
 caller's own cpu if the 'preempt' argument is also passed.

 Implemented via a scheduler hint, using cfs_rq->next to encourage the
 target being selected.

 Signed-off-by: Rik van Riel r...@redhat.com
 Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
 Signed-off-by: Mike Galbraith efa...@gmx.de

 ---
  include/linux/sched.h |    1
  kernel/sched.c        |   56 
 ++
  kernel/sched_fair.c   |   52 ++
  3 files changed, 109 insertions(+)

 Index: linux-2.6/include/linux/sched.h
 ===
 --- linux-2.6.orig/include/linux/sched.h
 +++ linux-2.6/include/linux/sched.h
 @@ -1056,6 +1056,7 @@ struct sched_class {
        void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
        void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
        void (*yield_task) (struct rq *rq);
 +       int (*yield_to_task) (struct task_struct *p, int preempt);

        void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int 
 flags);

 Index: linux-2.6/kernel/sched.c
 ===
 --- linux-2.6.orig/kernel/sched.c
 +++ linux-2.6/kernel/sched.c
 @@ -5327,6 +5327,62 @@ void __sched yield(void)
  }
  EXPORT_SYMBOL(yield);

 +/**
 + * yield_to - yield the current processor to another thread in
 + * your thread group, or accelerate that thread toward the
 + * processor it's on.
 + *
 + * It's the caller's job to ensure that the target task struct
 + * can't go away on us before we can do any checks.
 + */
 +void __sched yield_to(struct task_struct *p, int preempt)
 +{
 +       struct task_struct *curr = current;
 +       struct rq *rq, *p_rq;
 +       unsigned long flags;
 +       int yield = 0;
 +
 +       local_irq_save(flags);
 +       rq = this_rq();
 +
 +again:
 +       p_rq = task_rq(p);
 +       double_rq_lock(rq, p_rq);
 +       while (task_rq(p) != p_rq) {
 +               double_rq_unlock(rq, p_rq);
 +               goto again;
 +       }
 +
 +       if (!curr-sched_class-yield_to_task)
 +               goto out;
 +
 +       if (curr-sched_class != p-sched_class)
 +               goto out;
 +

to be clearer?
        if (task_running(p_rq, p) || p-state != TASK_RUNNING)

 +               goto out;
 +
 +       if (!same_thread_group(p, curr))
 +               goto out;
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow


On 01/06/2011 03:55 PM, Nikola Ciprich wrote:

  Same host kernel?
yes, I also disabled KSM now, se below.

  (qemu) cpu 1
  (qemu) info registers

RAX= RBX= RCX=0002 
RDX=55a9
RSI=fa8003660450 RDI=0001 RBP=0080 
RSP=f880009f7cc0
R8 = R9 =0f44 R10=f8000145a000 
R11=
R12= R13=f800015cada0 R14= 
R15=f880009bcec0
RIP=f80001575d5b RFL=0202 [---] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =002b   00c0f300 DPL=3 DS   [-WA]
CS =0010   00209b00 DPL=0 CS64 [-RA]
SS =0018   00c09300 DPL=0 DS   [-WA]
DS =002b   00c0f300 DPL=3 DS   [-WA]
FS =0053 fffe 7c00 0040f300 DPL=3 DS   [-WA]
GS =002b f880009b8000  00c0f300 DPL=3 DS   [-WA]
LDT=   
TR =0040 f880009bcec0 0067 8b00 DPL=0 TSS64-busy
GDT= f880009c34c0 007f
IDT= f880009c3540 0fff
CR0=80050031 CR2=f8800121ca30 CR3=00187000 CR4=06f8
DR0= DR1= DR2= 
DR3=
DR6=0ff0 DR7=0400
EFER=0d01
FCW=027f FSW=3800 [ST=7] FTW=80 MXCSR=1f80
FPR0=9fc0 4008 FPR1= 
FPR2=  FPR3= 
FPR4=  FPR5= 
FPR6=  FPR7= 
XMM00=99b6438668a3a8ed8cfd2d2540e9389a XMM01=
XMM02= XMM03=
XMM04= XMM05=
XMM06= XMM07=
XMM08= XMM09=
XMM10= XMM11=
XMM12= XMM13=
XMM14= XMM15=



  (qemu) x/100i 0xf80001575d59 - 35
virsh # qemu-monitor-command 2 x/100i 0xf80001575d59 - 35
0xf80001575d36:  mov%ebp,0x18(%rsp)
0xf80001575d3a:  push   %rsi
0xf80001575d3b:  push   %rdi
0xf80001575d3c:  push   %r12
0xf80001575d3e:  sub$0x30,%rsp
0xf80001575d42:  mov%cr8,%rbx
0xf80001575d46:  mov$0x2,%ecx
0xf80001575d4b:  cmp%cl,%bl
0xf80001575d4d:  ja 0xf80001575d67
0xf80001575d4f:  cmpq   $0x0,0x1598d1(%rip)# 0xf800016cf628
0xf80001575d57:  je 0xf80001575d5d
0xf80001575d59:  pause


rip points here, reasonable for a spin loop, but what it's spinning on, 
I can't imagine.


Maybe some bug in kvm caused Windows to spin here endlessly and that's 
what's causing the problem, this is consistent with you not reproducing 
it on the test machine.



hence I disabled KSM.
here's perf top then, checkdisk is still slow:
   412.00  8.6% do_raw_spin_lock [kernel.kallsyms]


$ perf record -a -f -g
$ perf report -g

will show who calls do_raw_spin_lock.


   235.00  4.9% send_mono_rect   /usr/bin/qemu-kvm
   215.00  4.5% rb_next  [kernel.kallsyms]
   166.00  3.5% schedule [kernel.kallsyms]


What's the context switch rate? 'vmstat 1'


   141.00  3.0% add_preempt_count[kernel.kallsyms]
   137.00  2.9% gen_rotc_rm_T1   /usr/bin/qemu-kvm


Do you have a guest running with kvm disabled?!


   127.00  2.7% vmx_vcpu_run 
/lib/modules/2.6.36lb.03/kernel/arch/x86/kvm/kvm-intel.ko


That's guest time.  Puny.


   120.00  2.5% kvm_mmu_prepare_zap_page 
/lib/modules/2.6.36lb.03/kernel/arch/x86/kvm/kvm.ko


Could be a couple of things.



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

GPU Pass-through need help.

2011-01-06 Thread Prasad Joshi

Hello All,

I am trying to pass-through a GPU PCIe ATI Radeon to VM. Here is a log on VM.

pra...@prasad-virtual-machine:~$ dmesg | grep -i -e drm -e radeon
[2.162294] [drm] Initialized drm 1.1.0 20060810
[2.459594] [drm] radeon defaulting to kernel modesetting.
[2.459596] [drm] radeon kernel modesetting enabled.
[2.766698] radeon :00:04.0: PCI INT A - Link[LNKD] - GSI 10
(level, high) - IRQ 10
[2.766734] radeon :00:04.0: setting latency timer to 64
[2.783512] [drm] initializing kernel modesetting (RV380 0x1002:0x5B64).
[2.792407] [drm] register mmio base: 0x4000
[2.792408] [drm] register mmio size: 65536
[2.797177] [drm] Generation 2 PCI interface, using max accessible memory
[2.797275] radeon :00:04.0: VRAM: 128M 0xF800 -
0x (128M used)
[2.797284] radeon :00:04.0: GTT: 512M 0xD800 -
0xF7FF
[2.798370] radeon :00:04.0: irq 40 for MSI/MSI-X
[2.870703] radeon :00:04.0: radeon: using MSI.
[3.151162] [drm] radeon: irq initialized.
[3.151539] [drm] Detected VRAM RAM=128M, BAR=128M
[3.151541] [drm] RAM width 128bits DDR
[3.151610] [drm] radeon: 128M of VRAM memory ready
[3.151611] [drm] radeon: 512M of GTT memory ready.
[3.151627] [drm] GART: num cpu pages 131072, num gpu pages 131072
[3.152701] [drm] radeon: 1 quad pipes, 1 Z pipes initialized.
[3.544943] [drm] PCIE GART of 512M enabled (table at 0xF804).
[3.548479] radeon :00:04.0: WB enabled
[3.549020] [drm] Loading R300 Microcode
[3.554278] [drm] radeon: ring at 0xD8001000
[3.778476] [drm:r100_ring_test] *ERROR* radeon: ring test failed
(sracth(0x15E4)=0xCAFEDEAD)
[3.781010] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).
[3.782542] radeon :00:04.0: failled initializing CP (-22).
[3.784006] radeon :00:04.0: Disabling GPU acceleration
[3.793709] [drm] radeon: cp finalized
[3.834875] radeon :00:04.0: 8800320b4c00 unpin not necessary
[3.841759] [drm] Radeon Display Connectors
[3.841766] [drm] Connector 0:
[3.841771] [drm]   VGA
[3.841777] [drm]   DDC: 0x60 0x60 0x60 0x60 0x60 0x60 0x60 0x60
[3.841781] [drm]   Encoders:
[3.841785] [drm] CRT1: INTERNAL_DAC1
[3.841788] [drm] Connector 1:
[3.841791] [drm]   DVI-I
[3.841794] [drm]   HPD1
[3.841799] [drm]   DDC: 0x64 0x64 0x64 0x64 0x64 0x64 0x64 0x64
[3.841803] [drm]   Encoders:
[3.841806] [drm] CRT2: INTERNAL_DAC2
[3.841809] [drm] DFP1: INTERNAL_TMDS1
[6.158753] [drm] fb mappable at 0xF804
[6.158756] [drm] vram apper at 0xF800
[6.158757] [drm] size 5242880
[6.158758] [drm] fb depth is 24
[6.158759] [drm]pitch is 5120
[   13.914196] fb0: radeondrmfb frame buffer device
[   13.914198] drm: registered panic notifier
[   13.914683] [drm] Initialized radeon 2.7.0 20080528 for
:00:04.0 on minor 0

I was previously seeing a problem with the ROM BIOS, but I could solve
it by passing a correct BIOS file in function assigned_initfn. For now
I am using hardcoded file name in the code.

Can anyone help me debug the following errors?
[3.778476] [drm:r100_ring_test] *ERROR* radeon: ring test failed
(sracth(0x15E4)=0xCAFEDEAD)
[3.781010] [drm:r100_cp_init] *ERROR* radeon: cp isn't working (-22).

3444 WREG32(scratch, 0xCAFEDEAD);
3445 r = radeon_ring_lock(rdev, 2);
3446 if (r) {
3447 DRM_ERROR(radeon: cp failed to lock ring (%d).\n, r);
3448 radeon_scratch_free(rdev, scratch);
3449 return r;
3450 }
3451 radeon_ring_write(rdev, PACKET0(scratch, 0));
3452 radeon_ring_write(rdev, 0xDEADBEEF);
3453 radeon_ring_unlock_commit(rdev);
3454 for (i = 0; i  rdev-usec_timeout; i++) {
3455 tmp = RREG32(scratch);
3456 if (tmp == 0xDEADBEEF) {
3457 break;
3458 }
3459 DRM_UDELAY(1);
3460 }
3461 if (i  rdev-usec_timeout) {
3462 DRM_INFO(ring test succeeded in %d usecs\n, i);
3463 } else {
3464 DRM_ERROR(radeon: ring test failed (sracth(0x%04X)=0x%08X)\n,
3465   scratch, tmp);
3466 r = -EINVAL;
3467 }

It seems that on line 3452 the code writes 0xDEADBEEF into some ring, and on
lines 3455-3456 it reads back whatever has been written to the ring; it fails
in this case because it reads back 0xCAFEDEAD.

How can I debug this VM problem?

BTW, the assigned device is listed in lspci output

pra...@prasad-virtual-machine:~$ lspci  | grep ATI
00:04.0 Display controller: ATI Technologies Inc RV370 5B64 [FireGL
V3100 (PCIE)] (rev 80)

I also noticed following log on the host machine.
[ 6042.798880] kvm: 1566: cpu0 unimplemented perfctr wrmsr:
0xc0010004data 0xabcd
[ 6104.970165] pci-stub :02:00.0: restoring config space at
offset0x1 (was 0x100400, writing 0x17)
[ 6105.530290] assign device 0:2:0.0
[ 6105.530351] pci-stub :02:00.0: irq 88 for MSI/MSI-X
[

Re: Host's write request is 4KB larger than that of the guest?


On 01/06/2011 03:53 PM, Duy Le (Dan) wrote:

Hi there,

I used a raw disk image to host a VM. I later used blktrace to capture
write requests issued from the guest to commit data to a physical disk
and found that the request size of the host is 8 blocks (4KB) larger
than that of the guest. Here is a part of the trace.

--- Guest
251,32   0  100 2.3352510  D   W 18747926 + 8 [kblockd/0]
251,32   0  101 2.3629710  D   W 18752462 + 24 [kblockd/0]
251,32   0  102 2.3836710  D   W 18756566 + 8 [kblockd/0]

--- Host
   8,16   0  155 2.415036111 32357  D   W 19146095 + 16 [kvm]
   8,16   0  156 2.415042667 32357  D   W 19150639 + 32 [kvm]
   8,16   0  158 2.415063867 32349  D   W 19154743 + 16 [kvm]

The content of those additional written sectors on the disk can be either
zero or something else. Please let me know if you have a clue, or
tell me which file in the package I should look at more closely to
understand this behavior.


Most likely your guest partitions are not aligned to a 4k boundary; this 
causes the host to add padding to the nearest 4k, which is consistent 
with what you saw.  You can use fdisk to confirm this.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

no screen output for '-vga vmware' at boot time

2011-01-06 Thread Harald Dunkel

Hi folks,

Booting Debian Squeeze on the guest I get a line

Loading initrd...

the rest of the boot procedure is omitted. The initrd
message is not scrolled off the screen.

The guest seems to boot, though. Kdm is shown as usual.
If I switch back to /dev/tty1, then I finally see the
last few lines of the lost screen output.


kvm command line:

kvm -m 512 -drive file=/dev/storage/vdpcl006.vda.lv -vnc :0 -usbdevice tablet 
-vga vmware

Using -vga cirrus there is no such problem.


The problem seems to be related to grub2 and changing the
screen size at boot time. I have added these lines to the
grub configuration on the guest:

GRUB_GFXMODE=1024x768
GRUB_GFXPAYLOAD_LINUX=keep

If I omit these lines, then the guest boots as usual.
There is also no problem with 1024x768 if I wait 10 seconds
for grub and the screen size change before connecting the
vncviewer. Timing seems an issue here.

Changing the vnc viewer did not help. I tried xvnc4viewer
and the vnc client in virt-manager. xtightvncviewer was no
option, because it dies on each change of the screen size.

qemu-kvm is version 0.12.5+dfsg-5, as found in Debian.
Kernel is 2.6.37 (host and guest). I could also reproduce
the problem using Debian's distro kernel for Testing.


Any helpful comment would be highly appreciated.



Regards

Harri
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: qemu-kvm-0.13.0 - winsows 2008 - chkdisk too slow


 $ perf record -a -f -g
 $ perf report -g
here we go:
here we go:
  - 49.72% _raw_spin_lock   
   ▒
 - 32.32% kvm_mmu_pte_write 
   ▒
- 98.02% emulator_write_phys
   ▒
 emulator_write_emulated_onepage
   ▒
 emulator_write_emulated
   ▒
 x86_emulate_insn   
   ▒
 emulate_instruction
   ▒
 kvm_mmu_page_fault 
   ▒
 handle_exception   
   ▒
 vmx_handle_exit
   ▒
 kvm_arch_vcpu_ioctl_run
   ▒
 kvm_vcpu_ioctl 
   ▒
 vfs_ioctl  
   ▒
 do_vfs_ioctl   
   ▒
 sys_ioctl  
   ▒
 system_call_fastpath   
   ▒
 __GI_ioctl 
   ▒
- 1.98% paging64_invlpg 
   ▒
 kvm_mmu_invlpg 
   ▒
 handle_invlpg  
   ▒
 vmx_handle_exit
   ▒
 kvm_arch_vcpu_ioctl_run
   ▒
 kvm_vcpu_ioctl 
   ▒
 vfs_ioctl  
   ▒
 do_vfs_ioctl   
   ▒
 sys_ioctl  
   ▒
 system_call_fastpath   
   ▒
 __GI_ioctl 
   ▒
 - 23.66% task_rq_lock  
   ▒
- try_to_wake_up
   ▒
   - 94.76% wake_up_process 
   ▒
cpu_stop_queue_work 
   ▒

[PATCH 0/2] vmx_vcpu_run micro-optimizations

A couple of minor optimizations to the vmx_vcpu_run assembly code.

Avi Kivity (2):
  KVM: VMX: Simplify saving guest rcx in vmx_vcpu_run
  KVM: VMX: Avoid atomic operation in vmx_vcpu_run

 arch/x86/kvm/vmx.c |   11 +++
 1 files changed, 7 insertions(+), 4 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 1/2] KVM: VMX: Simplify saving guest rcx in vmx_vcpu_run

Change

  push top-of-stack
  pop guest-rcx
  pop dummy

to

  pop guest-rcx

which is the same thing, only simpler.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bf89ec2..0d56fe0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4035,7 +4035,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
xchg %0, (%%Rsp) \n\t
mov %%Rax, %c[rax](%0) \n\t
mov %%Rbx, %c[rbx](%0) \n\t
-   pushQ (%%Rsp); popQ %c[rcx](%0) \n\t
+   popQ %c[rcx](%0) \n\t
mov %%Rdx, %c[rdx](%0) \n\t
mov %%Rsi, %c[rsi](%0) \n\t
mov %%Rdi, %c[rdi](%0) \n\t
@@ -4053,7 +4053,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
mov %%cr2, %%Rax   \n\t
mov %%Rax, %c[cr2](%0) \n\t
 
-   pop  %%Rbp; pop  %%Rbp; pop  %%Rdx \n\t
+   pop  %%Rbp; pop  %%Rdx \n\t
setbe %c[fail](%0) \n\t
  : : c(vmx), d((unsigned long)HOST_RSP),
[launched]i(offsetof(struct vcpu_vmx, launched)),
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 2/2] KVM: VMX: Avoid atomic operation in vmx_vcpu_run

Instead of exchanging the guest and host rcx, have separate storage
for each.  This allows us to avoid using the xchg instruction, which
is a little slower than normal operations.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |7 +--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0d56fe0..d9fd4e6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3991,6 +3991,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
asm(
/* Store host registers */
push %%Rdx; push %%Rbp;
+   push %%Rcx \n\t /* placeholder for guest rcx */
push %%Rcx \n\t
cmp %%Rsp, %c[host_rsp](%0) \n\t
je 1f \n\t
@@ -4032,7 +4033,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
.Llaunched:  __ex(ASM_VMX_VMRESUME) \n\t
.Lkvm_vmx_return: 
/* Save guest registers, load host registers, keep flags */
-   xchg %0, (%%Rsp) \n\t
+   mov %0, %c[wordsize](%%Rsp) \n\t
+   pop %0 \n\t
mov %%Rax, %c[rax](%0) \n\t
mov %%Rbx, %c[rbx](%0) \n\t
popQ %c[rcx](%0) \n\t
@@ -4076,7 +4078,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
[r14]i(offsetof(struct vcpu_vmx, 
vcpu.arch.regs[VCPU_REGS_R14])),
[r15]i(offsetof(struct vcpu_vmx, 
vcpu.arch.regs[VCPU_REGS_R15])),
 #endif
-   [cr2]i(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
+   [cr2]i(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
+   [wordsize]i(sizeof(ulong))
  : cc, memory
, Rax, Rbx, Rdi, Rsi
 #ifdef CONFIG_X86_64
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 2/5] kvm_config: print directly to stdout instead of using logging

From: Eduardo Habkost ehabk...@raisama.net

If the whole purpose of running kvm_config.py directly is to print the
dictionary contents, it is better to simply dump the information to
stdout instead of adding the logging info and timestamp clutter to
every single line.

Signed-off-by: Eduardo Habkost ehabk...@raisama.net
---
 client/tests/kvm/kvm_config.py |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index 45d8fe6..5be2e66 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -695,8 +695,8 @@ if __name__ == __main__:
 cfg.parse_file(fn)
 dicts = cfg.get_generator()
 for i, dict in enumerate(dicts):
-logging.info(Dictionary #%d:, i)
+print Dictionary #%d: % (i)
 keys = dict.keys()
 keys.sort()
 for key in keys:
-logging.info(%s = %s, key, dict[key])
+print %s = %s % (key, dict[key])
-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 1/5] kvm_config: accept multiple filenames as argument

From: Eduardo Habkost ehabk...@raisama.net

Useful to test and debug cases where config settings are concatenated together,
without the need to change the base .cfg file.

Signed-off-by: Eduardo Habkost ehabk...@raisama.net
---
 client/tests/kvm/kvm_config.py |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index 4fc1029..45d8fe6 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -682,15 +682,18 @@ if __name__ == __main__:
 options, args = parser.parse_args()
 debug = options.debug
 if args:
-filename = args[0]
+filenames = args
 else:
-filename = os.path.join(os.path.dirname(sys.argv[0]), tests.cfg)
+filenames = [os.path.join(os.path.dirname(sys.argv[0]), tests.cfg)]
 
 # Here we configure the stand alone program to use the autotest
 # logging system.
 logging_manager.configure_logging(kvm_utils.KvmLoggingConfig(),
   verbose=debug)
-dicts = config(filename, debug=debug).get_generator()
+cfg = config(debug=debug)
+for fn in filenames:
+cfg.parse_file(fn)
+dicts = cfg.get_generator()
 for i, dict in enumerate(dicts):
 logging.info(Dictionary #%d:, i)
 keys = dict.keys()
-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 5/5] kvm_config: inform filename and line number on error message

From: Eduardo Habkost ehabk...@raisama.net

Include the filename and line number in the "Using variants in this
context is not allowed" exception error message.

Signed-off-by: Eduardo Habkost ehabk...@raisama.net
---
 client/tests/kvm/kvm_config.py |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index c206743..c4d9a01 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -297,7 +297,7 @@ class config:
 # (inside an exception or inside subvariants)
 if restricted:
 e_msg = Using variants in this context is not allowed
-raise error.AutotestError(e_msg)
+cr.raise_error(e_msg)
 if self.debug and not restricted:
 _debug_print(indented_line,
  Entering variants block (%d dicts in 
-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 0/5] small kvm_config usability changes

From: Eduardo Habkost ehabk...@raisama.net

This series introduces some changes to kvm_config.py to make it more usable when
running it directly from the command-line.

Eduardo Habkost (5):
  kvm_config: accept multiple filenames as argument
  kvm_config: print directly to stdout instead of using logging
  kvm_config: store filename on configreader
  kvm_config: add helper to raise exception informing line number
  kvm_config: inform filename and line number on error message

 client/tests/kvm/kvm_config.py |   40 +---
 1 files changed, 29 insertions(+), 11 deletions(-)

-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 3/5] kvm_config: store filename on configreader

From: Eduardo Habkost ehabk...@raisama.net

It will be useful to generate better error messages.

Signed-off-by: Eduardo Habkost ehabk...@raisama.net
---
 client/tests/kvm/kvm_config.py |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index 5be2e66..35e2ab9 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -48,7 +48,7 @@ class config:
 raise IOError(File %s not found % filename)
 self.filename = filename
 str = open(filename).read()
-self.list = self.parse(configreader(str), self.list)
+self.list = self.parse(configreader(filename, str), self.list)
 
 
 def parse_string(self, str):
@@ -57,7 +57,7 @@ class config:
 
 @param str: String to parse.
 
-self.list = self.parse(configreader(str), self.list)
+self.list = self.parse(configreader('string', str), self.list)
 
 
 def fork_and_parse(self, filename=None, str=None):
@@ -342,7 +342,7 @@ class config:
 words[1])
 if os.path.exists(filename):
 str = open(filename).read()
-list = self.parse(configreader(str), list, restricted)
+list = self.parse(configreader(filename, str), list, 
restricted)
 if self.debug and not restricted:
 _debug_print(, Leaving file %s % words[1])
 else:
@@ -539,12 +539,13 @@ class configreader:
 whose readline() and/or seek() methods seem to be slow.
 
 
-def __init__(self, str):
+def __init__(self, filename, str):
 
 Initialize the reader.
 
 @param str: The string to parse.
 
+self.filename = filename
 self.line_index = 0
 self.lines = []
 for line in str.splitlines():
-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH 4/5] kvm_config: add helper to raise exception informing line number

From: Eduardo Habkost ehabk...@raisama.net

Useful for syntax or other errors on the config file. We want to tell
the user on which file:line the error is located.

Signed-off-by: Eduardo Habkost ehabk...@raisama.net
---
 client/tests/kvm/kvm_config.py |   16 +++-
 1 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index 35e2ab9..c206743 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -548,7 +548,8 @@ class configreader:
 self.filename = filename
 self.line_index = 0
 self.lines = []
-for line in str.splitlines():
+self.real_number = []
+for num,line in enumerate(str.splitlines(), 1):
 line = line.rstrip().expandtabs()
 stripped_line = line.strip()
 indent = len(line) - len(stripped_line)
@@ -557,6 +558,7 @@ class configreader:
 or stripped_line.startswith(//)):
 continue
 self.lines.append((line, stripped_line, indent))
+self.real_number.append(num)
 
 
 def get_next_line(self):
@@ -589,6 +591,18 @@ class configreader:
 
 self.line_index = index
 
+def raise_error(self, msg):
+Raise an error related to the last line returned by get_next_line()
+
+if self.line_index == 0: # nothing was read. shouldn't happen, but...
+line_id = 'BEGIN'
+elif self.line_index = len(self.lines): # past EOF
+line_id = 'EOF'
+else:
+# line_index is the _next_ line. get the previous one
+line_id = str(self.real_number[self.line_index-1])
+raise error.AutotestError(%s:%s: %s % (self.filename, line_id, msg))
+
 
 # Array structure:
 # 
-- 
1.7.3.2

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[KVM-AUTOTEST PATCH] kvm_config: don't store filename on config class

[Extra patch that may be added to the series I submitted previously]

A single 'config' object may parse multiple files, and most of the data
needed to parse a file is passed as argument to the parse() method,
except for the filename. As we're now tracking the filename on the
configreader object (that is created only when we start reading a file),
we can simply use this attribute instead of config.filename.

The config.filename attribute was only used when handling the 'include'
statement, so that's the only part of the code that needs to be adapted
to accommodate the change.

Signed-off-by: Eduardo Habkost ehabk...@redhat.com
---
 client/tests/kvm/kvm_config.py |   46 ---
 1 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/client/tests/kvm/kvm_config.py b/client/tests/kvm/kvm_config.py
index c4d9a01..f7c1e9f 100755
--- a/client/tests/kvm/kvm_config.py
+++ b/client/tests/kvm/kvm_config.py
@@ -32,7 +32,6 @@ class config:
 self.object_cache = []
 self.object_cache_indices = {}
 self.regex_cache = {}
-self.filename = filename
 self.debug = debug
 if filename:
 self.parse_file(filename)
@@ -46,7 +45,6 @@ class config:
 
 if not os.path.exists(filename):
 raise IOError(File %s not found % filename)
-self.filename = filename
 str = open(filename).read()
 self.list = self.parse(configreader(filename, str), self.list)
 
@@ -57,7 +55,7 @@ class config:
 
 @param str: String to parse.
 
-self.list = self.parse(configreader('string', str), self.list)
+self.list = self.parse(configreader('string', str, real_file=False), 
self.list)
 
 
 def fork_and_parse(self, filename=None, str=None):
@@ -337,20 +335,21 @@ class config:
 continue
 if self.debug and not restricted:
 _debug_print(indented_line, Entering file %s % words[1])
-if self.filename:
-filename = os.path.join(os.path.dirname(self.filename),
-words[1])
-if os.path.exists(filename):
-str = open(filename).read()
-list = self.parse(configreader(filename, str), list, 
restricted)
-if self.debug and not restricted:
-_debug_print(, Leaving file %s % words[1])
-else:
-logging.warning(Cannot include %s -- file not found,
-filename)
-else:
-logging.warning(Cannot include %s because no file is 
-currently open, words[1])
+
+cur_filename = cr.real_filename()
+if cur_filename is None:
+cr.raise_error('include' is valid only when parsing a 
file)
+
+filename = os.path.join(os.path.dirname(cur_filename),
+words[1])
+if not os.path.exists(filename):
+cr.raise_error(Cannot include %s -- file not found % 
(filename))
+
+str = open(filename).read()
+list = self.parse(configreader(filename, str), list, 
restricted)
+if self.debug and not restricted:
+_debug_print(, Leaving file %s % words[1])
+
 continue
 
 # Parse multi-line exceptions
@@ -539,13 +538,16 @@ class configreader:
 whose readline() and/or seek() methods seem to be slow.
 
 
-def __init__(self, filename, str):
+def __init__(self, filename, str, real_file=True):
 
 Initialize the reader.
 
+@param filename: the filename we're parsing
 @param str: The string to parse.
+@param real_file: Indicates if filename represents a real file. 
Defaults to True.
 
 self.filename = filename
+self.is_real_file = real_file
 self.line_index = 0
 self.lines = []
 self.real_number = []
@@ -561,6 +563,14 @@ class configreader:
 self.real_number.append(num)
 
 
+def real_filename(self):
+Returns the filename we're reading, in case it is a real file
+
+@returns the filename we are parsing, or None in case we're not 
parsing a real file
+
+if self.is_real_file:
+return self.filename
+
 def get_next_line(self):
 
 Get the next non-empty, non-comment line in the string.
-- 
1.7.3.2

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: FIXED: Re: [Qemu-devel] possible regression in qemu-kvm 0.13.0 (memtest)

2011-01-06 Thread Serge E. Hallyn

Thanks, Stefan.  That patch actually doesn't compile for me, because
it leaves references in hw/pckbd.c to both ioport92_write and
ioport92_read, which it deletes from there.  Should ioport92_read
just be renamed to outport_read instead of deleted, and the remaining
references changed to {input,output}_read?

thanks,
-serge

[patch for reference]

diff --git a/hw/pc.c b/hw/pc.c
index 18a4a9f..e63b397 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -411,11 +411,71 @@ void pc_cmos_init(ram_addr_t ram_size,
ram_addr_t above_4g_mem_size,
 qemu_register_reset(pc_cmos_init_late, arg);
 }

+/* port 92 stuff: could be split off */
+typedef struct Port92State {
+ISADevice dev;
+uint8_t outport;
+qemu_irq *a20_out;
+} Port92State;
+
+static void port92_write(void *opaque, uint32_t addr, uint32_t val)
+{
+Port92State *s = opaque;
+
+DPRINTF(port92: write 0x%02x\n, val);
+s-outport = val;
+qemu_set_irq(*s-a20_out, (val  1)  1);
+if (val  1) {
+qemu_system_reset_request();
+}
+}
+
+static uint32_t port92_read(void *opaque, uint32_t addr)
+{
+Port92State *s = opaque;
+uint32_t ret;
+
+ret = s-outport;
+DPRINTF(port92: read 0x%02x\n, ret);
+return ret;
+}
+
+static void port92_init(ISADevice *dev, qemu_irq *a20_out)
+{
+Port92State *s = DO_UPCAST(Port92State, dev, dev);
+
+s-a20_out = a20_out;
+}
+
+static int port92_initfn(ISADevice *dev)
+{
+Port92State *s = DO_UPCAST(Port92State, dev, dev);
+
+register_ioport_read(0x92, 1, 1, port92_read, s);
+register_ioport_write(0x92, 1, 1, port92_write, s);
+isa_init_ioport(dev, 0x92);
+return 0;
+}
+
+static ISADeviceInfo port92_info = {
+.qdev.name = port92,
+.qdev.size = sizeof(Port92State),
+.qdev.no_user  = 1,
+.init  = port92_initfn,
+};
+
+static void port92_register(void)
+{
+isa_qdev_register(port92_info);
+}
+device_init(port92_register)
+
 static void handle_a20_line_change(void *opaque, int irq, int level)
 {
 CPUState *cpu = opaque;

 /* XXX: send to all CPUs ? */
+/* XXX: add logic to handle multiple A20 line sources */
 cpu_x86_set_a20(cpu, level);
 }

@@ -1027,7 +1087,7 @@ void pc_basic_device_init(qemu_irq *isa_irq,
 PITState *pit;
 qemu_irq rtc_irq = NULL;
 qemu_irq *a20_line;
-ISADevice *i8042;
+ISADevice *i8042, *port92;
 qemu_irq *cpu_exit_irq;

 register_ioport_write(0x80, 1, 1, ioport80_write, NULL);
@@ -1061,10 +1121,12 @@ void pc_basic_device_init(qemu_irq *isa_irq,
 }
 }

-a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 1);
+a20_line = qemu_allocate_irqs(handle_a20_line_change, first_cpu, 2);
 i8042 = isa_create_simple(i8042);
-i8042_setup_a20_line(i8042, a20_line);
+i8042_setup_a20_line(i8042, a20_line[0]);
 vmmouse_init(i8042);
+port92 = isa_create_simple(port92);
+port92_init(port92, a20_line[1]);

 cpu_exit_irq = qemu_allocate_irqs(cpu_request_exit, NULL, 1);
 DMA_init(0, cpu_exit_irq);
diff --git a/hw/pckbd.c b/hw/pckbd.c
index 863b485..958de0a 100644
--- a/hw/pckbd.c
+++ b/hw/pckbd.c
@@ -211,10 +211,8 @@ static void kbd_queue(KBDState *s, int b, int aux)
 ps2_queue(s-kbd, b);
 }

-static void ioport92_write(void *opaque, uint32_t addr, uint32_t val)
+static void outport_write(KBDState *s, uint32_t addr, uint32_t val)
 {
-KBDState *s = opaque;
-
 DPRINTF(kbd: write outport=0x%02x\n, val);
 s-outport = val;
 if (s-a20_out) {
@@ -225,16 +223,6 @@ static void ioport92_write(void *opaque, uint32_t
addr, uint32_t val)
 }
 }

-static uint32_t ioport92_read(void *opaque, uint32_t addr)
-{
-KBDState *s = opaque;
-uint32_t ret;
-
-ret = s-outport;
-DPRINTF(kbd: read outport=0x%02x\n, ret);
-return ret;
-}
-
 static void kbd_write_command(void *opaque, uint32_t addr, uint32_t val)
 {
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] KVM: PPC: Fix SPRG get/set for Book3S and BookE

On Wed, Dec 29, 2010 at 01:51:25PM -0600, Peter Tyser wrote:
 Previously SPRGs 4-7 were improperly read and written in
 kvm_arch_vcpu_ioctl_get_regs() and kvm_arch_vcpu_ioctl_set_regs();
 
 Signed-off-by: Peter Tyser pty...@xes-inc.com
 ---
 I noticed this while grepping for something unrelated and assume it's
 a typo.  Feel free to add to the patch description; I don't use KVM
 so don't know what the high-level consequences of this change are.
 
  arch/powerpc/kvm/book3s.c |   14 --
  arch/powerpc/kvm/booke.c  |   14 --
  2 files changed, 16 insertions(+), 12 deletions(-)

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 01/35] kvm: Enable user space NMI injection for kvm guest

From: Lai Jiangshan la...@cn.fujitsu.com

Make use of the new KVM_NMI IOCTL to send NMIs into the KVM guest if the
user space raised them. (example: qemu monitor's nmi command)

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 configure |3 +++
 target-i386/kvm.c |7 +++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/configure b/configure
index 47e4cf0..ec37a91 100755
--- a/configure
+++ b/configure
@@ -1674,6 +1674,9 @@ if test $kvm != no ; then
 #if !defined(KVM_CAP_DESTROY_MEMORY_REGION_WORKS)
 #error Missing KVM capability KVM_CAP_DESTROY_MEMORY_REGION_WORKS
 #endif
+#if !defined(KVM_CAP_USER_NMI)
+#error Missing KVM capability KVM_CAP_USER_NMI
+#endif
 int main(void) { return 0; }
 EOF
   if test $kerneldir !=  ; then
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 7dfc357..755f8c9 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1417,6 +1417,13 @@ int kvm_arch_get_registers(CPUState *env)
 
 int kvm_arch_pre_run(CPUState *env, struct kvm_run *run)
 {
+/* Inject NMI */
+if (env-interrupt_request  CPU_INTERRUPT_NMI) {
+env-interrupt_request = ~CPU_INTERRUPT_NMI;
+DPRINTF(injected NMI\n);
+kvm_vcpu_ioctl(env, KVM_NMI);
+}
+
 /* Try to inject an interrupt if the guest can accept it */
 if (run-ready_for_interrupt_injection 
 (env-interrupt_request  CPU_INTERRUPT_HARD) 
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 29/35] kvm: Drop smp_cpus argument from init functions

From: Jan Kiszka jan.kis...@siemens.com

No longer used.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c  |4 ++--
 kvm-stub.c |2 +-
 kvm.h  |4 ++--
 target-i386/kvm.c  |2 +-
 target-ppc/kvm.c   |2 +-
 target-s390x/kvm.c |2 +-
 vl.c   |2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index d8820c7..190fcdf 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -584,7 +584,7 @@ static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
 .migration_log = kvm_client_migration_log,
 };
 
-int kvm_init(int smp_cpus)
+int kvm_init(void)
 {
 static const char upgrade_note[] =
 Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n
@@ -687,7 +687,7 @@ int kvm_init(int smp_cpus)
 kvm_state.xcrs = kvm_check_extension(KVM_CAP_XCRS);
 #endif
 
-ret = kvm_arch_init(smp_cpus);
+ret = kvm_arch_init();
 if (ret  0) {
 goto err;
 }
diff --git a/kvm-stub.c b/kvm-stub.c
index 3a058ad..e00d7df 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -58,7 +58,7 @@ int kvm_check_extension(unsigned int extension)
 return 0;
 }
 
-int kvm_init(int smp_cpus)
+int kvm_init(void)
 {
 return -ENOSYS;
 }
diff --git a/kvm.h b/kvm.h
index 26ca8c1..31d9f21 100644
--- a/kvm.h
+++ b/kvm.h
@@ -34,7 +34,7 @@ struct kvm_run;
 
 /* external API */
 
-int kvm_init(int smp_cpus);
+int kvm_init(void);
 
 int kvm_has_sync_mmu(void);
 int kvm_has_vcpu_events(void);
@@ -101,7 +101,7 @@ int kvm_arch_get_registers(CPUState *env);
 
 int kvm_arch_put_registers(CPUState *env, int level);
 
-int kvm_arch_init(int smp_cpus);
+int kvm_arch_init(void);
 
 int kvm_arch_init_vcpu(CPUState *env);
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 47cb22b..a907578 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -612,7 +612,7 @@ static int kvm_init_identity_map_page(void)
 return 0;
 }
 
-int kvm_arch_init(int smp_cpus)
+int kvm_arch_init(void)
 {
 int ret;
 struct utsname utsname;
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 56d30cc..72f2f94 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -56,7 +56,7 @@ static void kvm_kick_env(void *env)
 qemu_cpu_kick(env);
 }
 
-int kvm_arch_init(int smp_cpus)
+int kvm_arch_init(void)
 {
 #ifdef KVM_CAP_PPC_UNSET_IRQ
 cap_interrupt_unset = kvm_check_extension(KVM_CAP_PPC_UNSET_IRQ);
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index 927a37e..4f9075c 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -70,7 +70,7 @@
 #define SCLP_CMDW_READ_SCP_INFO 0x00020001
 #define SCLP_CMDW_READ_SCP_INFO_FORCED  0x00120001
 
-int kvm_arch_init(int smp_cpus)
+int kvm_arch_init(void)
 {
 return 0;
 }
diff --git a/vl.c b/vl.c
index b0b6605..fd47f4c 100644
--- a/vl.c
+++ b/vl.c
@@ -2837,7 +2837,7 @@ int main(int argc, char **argv, char **envp)
 }
 
 if (kvm_allowed) {
-int ret = kvm_init(smp_cpus);
+int ret = kvm_init();
 if (ret  0) {
 if (!kvm_available()) {
 printf(KVM not supported for this target\n);
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 32/35] kvm: Flush coalesced mmio buffer on IO window exits

From: Jan Kiszka jan.kis...@siemens.com

We must flush pending mmio writes if we leave kvm_cpu_exec for an IO
window. Otherwise we risk losing those requests when migrating to a
different host during that window.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 7a5b299..a5e9246 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -851,6 +851,8 @@ void kvm_cpu_exec(CPUState *env)
 cpu_single_env = env;
 kvm_arch_post_run(env, run);
 
+kvm_flush_coalesced_mmio_buffer();
+
 if (ret == -EINTR || ret == -EAGAIN) {
 cpu_exit(env);
 DPRINTF(io window exit\n);
@@ -863,8 +865,6 @@ void kvm_cpu_exec(CPUState *env)
 abort();
 }
 
-kvm_flush_coalesced_mmio_buffer();
-
 ret = 0; /* exit loop */
 switch (run-exit_reason) {
 case KVM_EXIT_IO:
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 00/35] [PULL] qemu-kvm.git uq/master queue

The following changes since commit 23979dc5411befabe9049e37075b2b6320debc4e:

  microblaze: Use more TB chaining (2011-01-05 02:23:09 +0100)

are available in the git repository at:
  git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git uq/master

Jan Kiszka (27):
  kvm: x86: Fix DPL write back of segment registers
  kvm: x86: Remove obsolete SS.RPL/DPL aligment
  kvm: x86: Prevent sign extension of DR7 in guest debugging mode
  kvm: x86: Fix a few coding style violations
  kvm: Fix coding style violations
  kvm: Drop return value of kvm_cpu_exec
  kvm: Stop on all fatal exit reasons
  kvm: Improve reporting of fatal errors
  x86: Optionally dump code bytes on cpu_dump_state
  kvm: x86: Align kvm_arch_put_registers code with comment
  kvm: x86: Prepare kvm_get_mp_state for in-kernel irqchip
  kvm: x86: Remove redundant mp_state initialization
  kvm: x86: Fix xcr0 reset mismerge
  kvm: x86: Refactor msr_star/hsave_pa setup and checks
  kvm: x86: Reset paravirtual MSRs
  Synchronize VCPU states before reset
  kvm: x86: Drop MCE MSRs write back restrictions
  kvm: Eliminate KVMState arguments
  kvm: x86: Fix !CONFIG_KVM_PARA build
  kvm: x86: Introduce kvmclock device to save/restore its state
  kvm: Drop smp_cpus argument from init functions
  kvm: Consolidate must-have capability checks
  kvm: x86: Rework identity map and TSS setup for larger BIOS sizes
  kvm: Flush coalesced mmio buffer on IO window exits
  kvm: Do not use qemu_fair_mutex
  kvm: x86: Implicitly clear nmi_injected/pending on reset
  kvm: x86: Only read/write MSR_KVM_ASYNC_PF_EN if supported

Jin Dongming (6):
  Clean up cpu_inject_x86_mce()
  Add broadcast option for mce command
  Add function for checking mca broadcast of CPU
  kvm: introduce kvm_mce_in_progress
  kvm: kvm_mce_inj_* subroutines for templated error injections
  kvm: introduce kvm_inject_x86_mce_on

Lai Jiangshan (2):
  kvm: Enable user space NMI injection for kvm guest
  kvm: convert kvm_ioctl(KVM_CHECK_EXTENSION) to kvm_check_extension()

 configure |   36 ++-
 cpu-all.h |5 +-
 cpu-defs.h|2 -
 cpus.c|2 -
 hmp-commands.hx   |6 +-
 kvm-all.c |  447 -
 kvm-stub.c|8 +-
 kvm.h |   29 +-
 monitor.c |7 +-
 target-i386/cpu.h |9 +-
 target-i386/cpuid.c   |   14 +-
 target-i386/helper.c  |   97 +-
 target-i386/kvm.c |  882 +
 target-i386/kvm_x86.h |8 +-
 target-ppc/kvm.c  |   20 +-
 target-s390x/kvm.c|   12 +-
 vl.c  |3 +-
 17 files changed, 929 insertions(+), 658 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 15/35] kvm: Stop on all fatal exit reasons

From: Jan Kiszka jan.kis...@siemens.com

Ensure that we stop the guest whenever we face a fatal or unknown exit
reason. If we stop, we also have to enforce a cpu loop exit.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c |   15 +++
 target-i386/kvm.c |4 
 target-ppc/kvm.c  |4 
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 7518f2c..a46a3b6 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -774,7 +774,7 @@ static int kvm_handle_io(uint16_t port, void *data, int 
direction, int size,
 }
 
 #ifdef KVM_CAP_INTERNAL_ERROR_DATA
-static void kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
+static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
 {
 
 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
@@ -792,13 +792,13 @@ static void kvm_handle_internal_error(CPUState *env, 
struct kvm_run *run)
 if (run-internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
 fprintf(stderr, emulation failure\n);
 if (!kvm_arch_stop_on_emulation_error(env)) {
-return;
+return 0;
 }
 }
 /* FIXME: Should trigger a qmp message to let management know
  * something went wrong.
  */
-vm_stop(0);
+return -1;
 }
 #endif
 
@@ -926,16 +926,19 @@ void kvm_cpu_exec(CPUState *env)
 break;
 case KVM_EXIT_UNKNOWN:
 DPRINTF(kvm_exit_unknown\n);
+ret = -1;
 break;
 case KVM_EXIT_FAIL_ENTRY:
 DPRINTF(kvm_exit_fail_entry\n);
+ret = -1;
 break;
 case KVM_EXIT_EXCEPTION:
 DPRINTF(kvm_exit_exception\n);
+ret = -1;
 break;
 #ifdef KVM_CAP_INTERNAL_ERROR_DATA
 case KVM_EXIT_INTERNAL_ERROR:
-kvm_handle_internal_error(env, run);
+ret = kvm_handle_internal_error(env, run);
 break;
 #endif
 case KVM_EXIT_DEBUG:
@@ -956,6 +959,10 @@ void kvm_cpu_exec(CPUState *env)
 }
 } while (ret  0);
 
+if (ret  0) {
+vm_stop(0);
+env-exit_request = 1;
+}
 if (env-exit_request) {
 env-exit_request = 0;
 env-exception_index = EXCP_INTERRUPT;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index fda07d2..2431a1f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1534,6 +1534,10 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run 
*run)
 DPRINTF(handle_hlt\n);
 ret = kvm_handle_halt(env);
 break;
+default:
+fprintf(stderr, KVM: unknown exit reason %d\n, run-exit_reason);
+ret = -1;
+break;
 }
 
 return ret;
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 5caa07c..849b404 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -307,6 +307,10 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run 
*run)
 dprintf(handle halt\n);
 ret = kvmppc_handle_halt(env);
 break;
+default:
+fprintf(stderr, KVM: unknown exit reason %d\n, run-exit_reason);
+ret = -1;
+break;
 }
 
 return ret;
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 22/35] kvm: x86: Refactor msr_star/hsave_pa setup and checks

From: Jan Kiszka jan.kis...@siemens.com

Simplify kvm_has_msr_star/hsave_pa to booleans and push their one-time
initialization into kvm_arch_init. Also handle potential errors of that
setup procedure.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |   47 +++
 1 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index e46b901..d8f26bf 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -54,6 +54,8 @@
 #define BUS_MCEERR_AO 5
 #endif
 
+static bool has_msr_star;
+static bool has_msr_hsave_pa;
 static int lm_capable_kernel;
 
 #ifdef KVM_CAP_EXT_CPUID
@@ -459,13 +461,10 @@ void kvm_arch_reset_vcpu(CPUState *env)
 }
 }
 
-int has_msr_star;
-int has_msr_hsave_pa;
-
-static void kvm_supported_msrs(CPUState *env)
+static int kvm_get_supported_msrs(KVMState *s)
 {
 static int kvm_supported_msrs;
-int ret;
+int ret = 0;
 
 /* first time */
 if (kvm_supported_msrs == 0) {
@@ -476,9 +475,9 @@ static void kvm_supported_msrs(CPUState *env)
 /* Obtain MSR list from KVM.  These are the MSRs that we must
  * save/restore */
 msr_list.nmsrs = 0;
-ret = kvm_ioctl(env-kvm_state, KVM_GET_MSR_INDEX_LIST, msr_list);
+ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, msr_list);
 if (ret  0  ret != -E2BIG) {
-return;
+return ret;
 }
 /* Old kernel modules had a bug and could write beyond the provided
memory. Allocate at least a safe amount of 1K. */
@@ -487,17 +486,17 @@ static void kvm_supported_msrs(CPUState *env)
   sizeof(msr_list.indices[0])));
 
 kvm_msr_list-nmsrs = msr_list.nmsrs;
-ret = kvm_ioctl(env-kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
+ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
 if (ret = 0) {
 int i;
 
 for (i = 0; i  kvm_msr_list-nmsrs; i++) {
 if (kvm_msr_list-indices[i] == MSR_STAR) {
-has_msr_star = 1;
+has_msr_star = true;
 continue;
 }
 if (kvm_msr_list-indices[i] == MSR_VM_HSAVE_PA) {
-has_msr_hsave_pa = 1;
+has_msr_hsave_pa = true;
 continue;
 }
 }
@@ -506,19 +505,7 @@ static void kvm_supported_msrs(CPUState *env)
 free(kvm_msr_list);
 }
 
-return;
-}
-
-static int kvm_has_msr_hsave_pa(CPUState *env)
-{
-kvm_supported_msrs(env);
-return has_msr_hsave_pa;
-}
-
-static int kvm_has_msr_star(CPUState *env)
-{
-kvm_supported_msrs(env);
-return has_msr_star;
+return ret;
 }
 
 static int kvm_init_identity_map_page(KVMState *s)
@@ -543,9 +530,13 @@ static int kvm_init_identity_map_page(KVMState *s)
 int kvm_arch_init(KVMState *s, int smp_cpus)
 {
 int ret;
-
 struct utsname utsname;
 
+ret = kvm_get_supported_msrs(s);
+if (ret  0) {
+return ret;
+}
+
 uname(utsname);
 lm_capable_kernel = strcmp(utsname.machine, x86_64) == 0;
 
@@ -830,10 +821,10 @@ static int kvm_put_msrs(CPUState *env, int level)
 kvm_msr_entry_set(msrs[n++], MSR_IA32_SYSENTER_CS, env-sysenter_cs);
 kvm_msr_entry_set(msrs[n++], MSR_IA32_SYSENTER_ESP, env-sysenter_esp);
 kvm_msr_entry_set(msrs[n++], MSR_IA32_SYSENTER_EIP, env-sysenter_eip);
-if (kvm_has_msr_star(env)) {
+if (has_msr_star) {
 kvm_msr_entry_set(msrs[n++], MSR_STAR, env-star);
 }
-if (kvm_has_msr_hsave_pa(env)) {
+if (has_msr_hsave_pa) {
 kvm_msr_entry_set(msrs[n++], MSR_VM_HSAVE_PA, env-vm_hsave);
 }
 #ifdef TARGET_X86_64
@@ -1076,10 +1067,10 @@ static int kvm_get_msrs(CPUState *env)
 msrs[n++].index = MSR_IA32_SYSENTER_CS;
 msrs[n++].index = MSR_IA32_SYSENTER_ESP;
 msrs[n++].index = MSR_IA32_SYSENTER_EIP;
-if (kvm_has_msr_star(env)) {
+if (has_msr_star) {
 msrs[n++].index = MSR_STAR;
 }
-if (kvm_has_msr_hsave_pa(env)) {
+if (has_msr_hsave_pa) {
 msrs[n++].index = MSR_VM_HSAVE_PA;
 }
 msrs[n++].index = MSR_IA32_TSC;
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 19/35] kvm: x86: Prepare kvm_get_mp_state for in-kernel irqchip

From: Jan Kiszka jan.kis...@siemens.com

This code path will not yet be taken as we still lack in-kernel irqchip
support. But qemu-kvm can already make use of it and drop its own
mp_state access services.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 684430f..30aa51c 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1198,6 +1198,9 @@ static int kvm_get_mp_state(CPUState *env)
 return ret;
 }
 env-mp_state = mp_state.mp_state;
+if (kvm_irqchip_in_kernel()) {
+env-halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
+}
 return 0;
 }
 
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 11/35] kvm: x86: Prevent sign extension of DR7 in guest debugging mode

From: Jan Kiszka jan.kis...@siemens.com

This unbreaks guest debugging when the 4th hardware breakpoint used for
guest debugging is a watchpoint of 4 or 8 byte length. The 31st bit of
DR7 is set in that case and used to cause a sign extension to the high
word which was breaking the guest state (vm entry failure).

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 target-i386/kvm.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 7e5982b..85edacc 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1686,7 +1686,7 @@ void kvm_arch_update_guest_debug(CPUState *env, struct 
kvm_guest_debug *dbg)
 dbg-arch.debugreg[n] = hw_breakpoint[n].addr;
 dbg-arch.debugreg[7] |= (2  (n * 2)) |
 (type_code[hw_breakpoint[n].type]  (16 + n*4)) |
-(len_code[hw_breakpoint[n].len]  (18 + n*4));
+((uint32_t)len_code[hw_breakpoint[n].len]  (18 + n*4));
 }
 }
 /* Legal xcr0 for loading */
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 03/35] Clean up cpu_inject_x86_mce()

From: Jin Dongming jin.dongm...@np.css.fujitsu.com

Clean up cpu_inject_x86_mce() for later patch.

Signed-off-by: Jin Dongming jin.dongm...@np.css.fujitsu.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/helper.c |   27 +--
 1 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/target-i386/helper.c b/target-i386/helper.c
index 25a3e36..2c94130 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1021,21 +1021,12 @@ static void breakpoint_handler(CPUState *env)
 /* This should come from sysemu.h - if we could include it here... */
 void qemu_system_reset_request(void);
 
-void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+static void qemu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
 uint64_t mcg_status, uint64_t addr, uint64_t misc)
 {
 uint64_t mcg_cap = cenv-mcg_cap;
-unsigned bank_num = mcg_cap  0xff;
 uint64_t *banks = cenv-mce_banks;
 
-if (bank = bank_num || !(status  MCI_STATUS_VAL))
-return;
-
-if (kvm_enabled()) {
-kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
-return;
-}
-
 /*
  * if MSR_MCG_CTL is not all 1s, the uncorrected error
  * reporting is disabled
@@ -1076,6 +1067,22 @@ void cpu_inject_x86_mce(CPUState *cenv, int bank, 
uint64_t status,
 } else
 banks[1] |= MCI_STATUS_OVER;
 }
+
+void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
+uint64_t mcg_status, uint64_t addr, uint64_t misc)
+{
+unsigned bank_num = cenv-mcg_cap  0xff;
+
+if (bank = bank_num || !(status  MCI_STATUS_VAL)) {
+return;
+}
+
+if (kvm_enabled()) {
+kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
+} else {
+qemu_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+}
+}
 #endif /* !CONFIG_USER_ONLY */
 
 static void mce_init(CPUX86State *cenv)
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 10/35] kvm: x86: Remove obsolete SS.RPL/DPL aligment

From: Jan Kiszka jan.kis...@siemens.com

This seems to date back to the days KVM didn't support real mode. The
check is no longer needed and, even worse, is corrupting the guest state
in case SS.RPL != DPL.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 target-i386/kvm.c |7 ---
 1 files changed, 0 insertions(+), 7 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ee7bdf8..7e5982b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -780,13 +780,6 @@ static int kvm_put_sregs(CPUState *env)
set_seg(sregs.fs, env-segs[R_FS]);
set_seg(sregs.gs, env-segs[R_GS]);
set_seg(sregs.ss, env-segs[R_SS]);
-
-   if (env-cr[0]  CR0_PE_MASK) {
-   /* force ss cpl to cs cpl */
-   sregs.ss.selector = (sregs.ss.selector  ~3) |
-   (sregs.cs.selector  3);
-   sregs.ss.dpl = sregs.ss.selector  3;
-   }
 }
 
 set_seg(sregs.tr, env-tr);
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 18/35] kvm: x86: Align kvm_arch_put_registers code with comment

From: Jan Kiszka jan.kis...@siemens.com

The ordering doesn't matter in this case, but better keep it consistent.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index d4f253e..684430f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1388,12 +1388,12 @@ int kvm_arch_put_registers(CPUState *env, int level)
 if (ret  0) {
 return ret;
 }
-/* must be last */
-ret = kvm_guest_debug_workarounds(env);
+ret = kvm_put_debugregs(env);
 if (ret  0) {
 return ret;
 }
-ret = kvm_put_debugregs(env);
+/* must be last */
+ret = kvm_guest_debug_workarounds(env);
 if (ret  0) {
 return ret;
 }
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 33/35] kvm: Do not use qemu_fair_mutex

From: Jan Kiszka jan.kis...@siemens.com

The imbalance in the hold time of qemu_global_mutex only exists in TCG
mode. In contrast to TCG VCPUs, KVM drops the global lock during guest
execution. We already avoid touching the fairness lock from the
IO-thread in KVM mode, so also stop using it from the VCPU threads.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 cpus.c |2 --
 1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/cpus.c b/cpus.c
index 0309189..4c9928e 100644
--- a/cpus.c
+++ b/cpus.c
@@ -735,9 +735,7 @@ static sigset_t block_io_signals(void)
 void qemu_mutex_lock_iothread(void)
 {
 if (kvm_enabled()) {
-qemu_mutex_lock(qemu_fair_mutex);
 qemu_mutex_lock(qemu_global_mutex);
-qemu_mutex_unlock(qemu_fair_mutex);
 } else {
 qemu_mutex_lock(qemu_fair_mutex);
 if (qemu_mutex_trylock(qemu_global_mutex)) {
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 21/35] kvm: x86: Fix xcr0 reset mismerge

From: Jan Kiszka jan.kis...@siemens.com

For unknown reasons, xcr0 reset ended up in kvm_arch_update_guest_debug
on upstream merge. Fix this and also remove the misleading comment (1 is
THE reset value).

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 1403327..e46b901 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -450,6 +450,7 @@ void kvm_arch_reset_vcpu(CPUState *env)
 env-interrupt_injected = -1;
 env-nmi_injected = 0;
 env-nmi_pending = 0;
+env-xcr0 = 1;
 if (kvm_irqchip_in_kernel()) {
 env-mp_state = cpu_is_bsp(env) ? KVM_MP_STATE_RUNNABLE :
   KVM_MP_STATE_UNINITIALIZED;
@@ -1756,8 +1757,6 @@ void kvm_arch_update_guest_debug(CPUState *env, struct 
kvm_guest_debug *dbg)
 ((uint32_t)len_code[hw_breakpoint[n].len]  (18 + n*4));
 }
 }
-/* Legal xcr0 for loading */
-env-xcr0 = 1;
 }
 #endif /* KVM_CAP_SET_GUEST_DEBUG */
 
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 23/35] kvm: x86: Reset paravirtual MSRs

From: Jan Kiszka jan.kis...@siemens.com

Make sure to write the cleared MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
and MSR_KVM_ASYNC_PF_EN to the kernel state so that a freshly booted
guest cannot be disturbed by old values.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Glauber Costa glom...@redhat.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index d8f26bf..8267655 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -845,6 +845,13 @@ static int kvm_put_msrs(CPUState *env, int level)
 if (smp_cpus == 1 || env-tsc != 0) {
 kvm_msr_entry_set(msrs[n++], MSR_IA32_TSC, env-tsc);
 }
+}
+/*
+ * The following paravirtual MSRs have side effects on the guest or are
+ * too heavy for normal writeback. Limit them to reset or full state
+ * updates.
+ */
+if (level = KVM_PUT_RESET_STATE) {
 kvm_msr_entry_set(msrs[n++], MSR_KVM_SYSTEM_TIME,
   env-system_time_msr);
 kvm_msr_entry_set(msrs[n++], MSR_KVM_WALL_CLOCK, env-wall_clock_msr);
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 17/35] x86: Optionally dump code bytes on cpu_dump_state

From: Jan Kiszka jan.kis...@siemens.com

Introduce the cpu_dump_state flag CPU_DUMP_CODE and implement it for
x86. This writes out the code bytes around the current instruction
pointer. Make use of this feature in KVM to help debug fatal VM
exits.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 cpu-all.h|2 ++
 kvm-all.c|4 ++--
 target-i386/helper.c |   21 +
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/cpu-all.h b/cpu-all.h
index 4ce4e83..ffbd6a4 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -765,6 +765,8 @@ int page_check_range(target_ulong start, target_ulong len, 
int flags);
 CPUState *cpu_copy(CPUState *env);
 CPUState *qemu_get_cpu(int cpu);
 
+#define CPU_DUMP_CODE 0x0001
+
 void cpu_dump_state(CPUState *env, FILE *f, fprintf_function cpu_fprintf,
 int flags);
 void cpu_dump_statistics(CPUState *env, FILE *f, fprintf_function cpu_fprintf,
diff --git a/kvm-all.c b/kvm-all.c
index ad1d0a8..ef2ca3b 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -791,7 +791,7 @@ static int kvm_handle_internal_error(CPUState *env, struct 
kvm_run *run)
 if (run-internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
 fprintf(stderr, emulation failure\n);
 if (!kvm_arch_stop_on_emulation_error(env)) {
-cpu_dump_state(env, stderr, fprintf, 0);
+cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
 return 0;
 }
 }
@@ -953,7 +953,7 @@ void kvm_cpu_exec(CPUState *env)
 } while (ret  0);
 
 if (ret  0) {
-cpu_dump_state(env, stderr, fprintf, 0);
+cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
 vm_stop(0);
 env-exit_request = 1;
 }
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 6dfa27d..af2ce10 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -249,6 +249,9 @@ done:
 cpu_fprintf(f, \n);
 }
 
+#define DUMP_CODE_BYTES_TOTAL50
+#define DUMP_CODE_BYTES_BACKWARD 20
+
 void cpu_dump_state(CPUState *env, FILE *f, fprintf_function cpu_fprintf,
 int flags)
 {
@@ -434,6 +437,24 @@ void cpu_dump_state(CPUState *env, FILE *f, 
fprintf_function cpu_fprintf,
 cpu_fprintf(f,  );
 }
 }
+if (flags  CPU_DUMP_CODE) {
+target_ulong base = env-segs[R_CS].base + env-eip;
+target_ulong offs = MIN(env-eip, DUMP_CODE_BYTES_BACKWARD);
+uint8_t code;
+char codestr[3];
+
+cpu_fprintf(f, Code=);
+for (i = 0; i  DUMP_CODE_BYTES_TOTAL; i++) {
+if (cpu_memory_rw_debug(env, base - offs + i, code, 1, 0) == 0) {
+snprintf(codestr, sizeof(codestr), %02x, code);
+} else {
+snprintf(codestr, sizeof(codestr), ??);
+}
+cpu_fprintf(f, %s%s%s%s, i  0 ?  : ,
+i == offs ?  : , codestr, i == offs ?  : );
+}
+cpu_fprintf(f, \n);
+}
 }
 
 /***/
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 14/35] kvm: Drop return value of kvm_cpu_exec

From: Jan Kiszka jan.kis...@siemens.com

It is not used, it is not needed, so let's remove it.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c  |6 ++
 kvm-stub.c |4 ++--
 kvm.h  |2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 2538283..7518f2c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -850,7 +850,7 @@ void kvm_cpu_synchronize_post_init(CPUState *env)
 env-kvm_vcpu_dirty = 0;
 }
 
-int kvm_cpu_exec(CPUState *env)
+void kvm_cpu_exec(CPUState *env)
 {
 struct kvm_run *run = env-kvm_run;
 int ret;
@@ -943,7 +943,7 @@ int kvm_cpu_exec(CPUState *env)
 #ifdef KVM_CAP_SET_GUEST_DEBUG
 if (kvm_arch_debug(run-debug.arch)) {
 env-exception_index = EXCP_DEBUG;
-return 0;
+return;
 }
 /* re-enter, this exception was guest-internal */
 ret = 1;
@@ -960,8 +960,6 @@ int kvm_cpu_exec(CPUState *env)
 env-exit_request = 0;
 env-exception_index = EXCP_INTERRUPT;
 }
-
-return ret;
 }
 
 int kvm_ioctl(KVMState *s, int type, ...)
diff --git a/kvm-stub.c b/kvm-stub.c
index 5384a4b..352c6a6 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -79,9 +79,9 @@ void kvm_cpu_synchronize_post_init(CPUState *env)
 {
 }
 
-int kvm_cpu_exec(CPUState *env)
+void kvm_cpu_exec(CPUState *env)
 {
-abort ();
+abort();
 }
 
 int kvm_has_sync_mmu(void)
diff --git a/kvm.h b/kvm.h
index 60a9b42..51ad56f 100644
--- a/kvm.h
+++ b/kvm.h
@@ -46,7 +46,7 @@ int kvm_has_xcrs(void);
 #ifdef NEED_CPU_H
 int kvm_init_vcpu(CPUState *env);
 
-int kvm_cpu_exec(CPUState *env);
+void kvm_cpu_exec(CPUState *env);
 
 #if !defined(CONFIG_USER_ONLY)
 int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size);
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 26/35] kvm: Eliminate KVMState arguments

From: Jan Kiszka jan.kis...@siemens.com

QEMU supports only one VM, so there is only one kvm_state per process,
and we gain nothing passing a reference to it around. Eliminate any need
to refer to it outside of kvm-all.c.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Alexander Graf ag...@suse.de
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 cpu-defs.h|2 -
 kvm-all.c |  232 +
 kvm-stub.c|2 +-
 kvm.h |   15 +--
 target-i386/cpuid.c   |9 +-
 target-i386/kvm.c |   77 
 target-i386/kvm_x86.h |3 +
 target-ppc/kvm.c  |   12 ++--
 target-s390x/kvm.c|8 +--
 9 files changed, 160 insertions(+), 200 deletions(-)

diff --git a/cpu-defs.h b/cpu-defs.h
index 8d4bf86..0e04239 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -131,7 +131,6 @@ typedef struct icount_decr_u16 {
 #endif
 
 struct kvm_run;
-struct KVMState;
 struct qemu_work_item;
 
 typedef struct CPUBreakpoint {
@@ -207,7 +206,6 @@ typedef struct CPUWatchpoint {
 struct QemuCond *halt_cond; \
 struct qemu_work_item *queued_work_first, *queued_work_last;\
 const char *cpu_model_str;  \
-struct KVMState *kvm_state; \
 struct kvm_run *kvm_run;\
 int kvm_fd; \
 int kvm_vcpu_dirty;
diff --git a/kvm-all.c b/kvm-all.c
index ef2ca3b..d8820c7 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -52,8 +52,7 @@ typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
-struct KVMState
-{
+static struct KVMState {
 KVMSlot slots[32];
 int fd;
 int vmfd;
@@ -72,21 +71,19 @@ struct KVMState
 int irqchip_in_kernel;
 int pit_in_kernel;
 int xsave, xcrs;
-};
-
-static KVMState *kvm_state;
+} kvm_state;
 
-static KVMSlot *kvm_alloc_slot(KVMState *s)
+static KVMSlot *kvm_alloc_slot(void)
 {
 int i;
 
-for (i = 0; i  ARRAY_SIZE(s-slots); i++) {
+for (i = 0; i  ARRAY_SIZE(kvm_state.slots); i++) {
 /* KVM private memory slots */
 if (i = 8  i  12) {
 continue;
 }
-if (s-slots[i].memory_size == 0) {
-return s-slots[i];
+if (kvm_state.slots[i].memory_size == 0) {
+return kvm_state.slots[i];
 }
 }
 
@@ -94,14 +91,13 @@ static KVMSlot *kvm_alloc_slot(KVMState *s)
 abort();
 }
 
-static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
- target_phys_addr_t start_addr,
+static KVMSlot *kvm_lookup_matching_slot(target_phys_addr_t start_addr,
  target_phys_addr_t end_addr)
 {
 int i;
 
-for (i = 0; i  ARRAY_SIZE(s-slots); i++) {
-KVMSlot *mem = s-slots[i];
+for (i = 0; i  ARRAY_SIZE(kvm_state.slots); i++) {
+KVMSlot *mem = kvm_state.slots[i];
 
 if (start_addr == mem-start_addr 
 end_addr == mem-start_addr + mem-memory_size) {
@@ -115,15 +111,14 @@ static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
 /*
  * Find overlapping slot with lowest start address
  */
-static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
-target_phys_addr_t start_addr,
+static KVMSlot *kvm_lookup_overlapping_slot(target_phys_addr_t start_addr,
 target_phys_addr_t end_addr)
 {
 KVMSlot *found = NULL;
 int i;
 
-for (i = 0; i  ARRAY_SIZE(s-slots); i++) {
-KVMSlot *mem = s-slots[i];
+for (i = 0; i  ARRAY_SIZE(kvm_state.slots); i++) {
+KVMSlot *mem = kvm_state.slots[i];
 
 if (mem-memory_size == 0 ||
 (found  found-start_addr  mem-start_addr)) {
@@ -139,13 +134,13 @@ static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
 return found;
 }
 
-int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
+int kvm_physical_memory_addr_from_ram(ram_addr_t ram_addr,
   target_phys_addr_t *phys_addr)
 {
 int i;
 
-for (i = 0; i  ARRAY_SIZE(s-slots); i++) {
-KVMSlot *mem = s-slots[i];
+for (i = 0; i  ARRAY_SIZE(kvm_state.slots); i++) {
+KVMSlot *mem = kvm_state.slots[i];
 
 if (ram_addr = mem-phys_offset 
 ram_addr  mem-phys_offset + mem-memory_size) {
@@ -157,7 +152,7 @@ int kvm_physical_memory_addr_from_ram(KVMState *s, 
ram_addr_t ram_addr,
 return 0;
 }
 
-static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
+static int kvm_set_user_memory_region(KVMSlot *slot)
 {
 struct kvm_userspace_memory_region mem;
 
@@ -166,10 +161,10 @@ static int kvm_set_user_memory_region(KVMState *s, 
KVMSlot *slot)
 mem.memory_size = slot-memory_size;
 mem.userspace_addr = (unsigned

[PATCH 12/35] kvm: x86: Fix a few coding style violations

From: Jan Kiszka jan.kis...@siemens.com

No functional changes.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 target-i386/kvm.c |  335 +
 1 files changed, 182 insertions(+), 153 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 85edacc..fda07d2 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -150,34 +150,34 @@ uint32_t kvm_arch_get_supported_cpuid(CPUState *env, 
uint32_t function,
 
 #ifdef CONFIG_KVM_PARA
 struct kvm_para_features {
-int cap;
-int feature;
+int cap;
+int feature;
 } para_features[] = {
 #ifdef KVM_CAP_CLOCKSOURCE
-{ KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
+{ KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
 #endif
 #ifdef KVM_CAP_NOP_IO_DELAY
-{ KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
+{ KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
 #endif
 #ifdef KVM_CAP_PV_MMU
-{ KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
+{ KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
 #endif
 #ifdef KVM_CAP_ASYNC_PF
-{ KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
+{ KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
 #endif
-{ -1, -1 }
+{ -1, -1 }
 };
 
 static int get_para_features(CPUState *env)
 {
-int i, features = 0;
+int i, features = 0;
 
-for (i = 0; i  ARRAY_SIZE(para_features) - 1; i++) {
-if (kvm_check_extension(env-kvm_state, para_features[i].cap))
-features |= (1  para_features[i].feature);
+for (i = 0; i  ARRAY_SIZE(para_features) - 1; i++) {
+if (kvm_check_extension(env-kvm_state, para_features[i].cap)) {
+features |= (1  para_features[i].feature);
 }
-
-return features;
+}
+return features;
 }
 #endif
 
@@ -389,13 +389,15 @@ int kvm_arch_init_vcpu(CPUState *env)
 c-index = j;
 cpu_x86_cpuid(env, i, j, c-eax, c-ebx, c-ecx, c-edx);
 
-if (i == 4  c-eax == 0)
+if (i == 4  c-eax == 0) {
 break;
-if (i == 0xb  !(c-ecx  0xff00))
+}
+if (i == 0xb  !(c-ecx  0xff00)) {
 break;
-if (i == 0xd  c-eax == 0)
+}
+if (i == 0xd  c-eax == 0) {
 break;
-
+}
 c = cpuid_data.entries[cpuid_i++];
 }
 break;
@@ -425,17 +427,18 @@ int kvm_arch_init_vcpu(CPUState *env)
 uint64_t mcg_cap;
 int banks;
 
-if (kvm_get_mce_cap_supported(env-kvm_state, mcg_cap, banks))
+if (kvm_get_mce_cap_supported(env-kvm_state, mcg_cap, banks)) {
 perror(kvm_get_mce_cap_supported FAILED);
-else {
+} else {
 if (banks  MCE_BANKS_DEF)
 banks = MCE_BANKS_DEF;
 mcg_cap = MCE_CAP_DEF;
 mcg_cap |= banks;
-if (kvm_setup_mce(env, mcg_cap))
+if (kvm_setup_mce(env, mcg_cap)) {
 perror(kvm_setup_mce FAILED);
-else
+} else {
 env-mcg_cap = mcg_cap;
+}
 }
 }
 #endif
@@ -577,7 +580,7 @@ int kvm_arch_init(KVMState *s, int smp_cpus)
 
 return kvm_init_identity_map_page(s);
 }
-
+
 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
 {
 lhs-selector = rhs-selector;
@@ -616,23 +619,23 @@ static void get_seg(SegmentCache *lhs, const struct 
kvm_segment *rhs)
 lhs-selector = rhs-selector;
 lhs-base = rhs-base;
 lhs-limit = rhs-limit;
-lhs-flags =
-   (rhs-type  DESC_TYPE_SHIFT)
-   | (rhs-present * DESC_P_MASK)
-   | (rhs-dpl  DESC_DPL_SHIFT)
-   | (rhs-db  DESC_B_SHIFT)
-   | (rhs-s * DESC_S_MASK)
-   | (rhs-l  DESC_L_SHIFT)
-   | (rhs-g * DESC_G_MASK)
-   | (rhs-avl * DESC_AVL_MASK);
+lhs-flags = (rhs-type  DESC_TYPE_SHIFT) |
+ (rhs-present * DESC_P_MASK) |
+ (rhs-dpl  DESC_DPL_SHIFT) |
+ (rhs-db  DESC_B_SHIFT) |
+ (rhs-s * DESC_S_MASK) |
+ (rhs-l  DESC_L_SHIFT) |
+ (rhs-g * DESC_G_MASK) |
+ (rhs-avl * DESC_AVL_MASK);
 }
 
 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
 {
-if (set)
+if (set) {
 *kvm_reg = *qemu_reg;
-else
+} else {
 *qemu_reg = *kvm_reg;
+}
 }
 
 static int kvm_getput_regs(CPUState *env, int set)
@@ -642,8 +645,9 @@ static int kvm_getput_regs(CPUState *env, int set)
 
 if (!set) {
 ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
-if (ret  0)
+if (ret  0) {
 return ret;
+}
 }
 
 kvm_getput_reg(regs.rax, env-regs[R_EAX], set);
@@ -668,8 +672,9 @@ static int kvm_getput_regs(CPUState *env, int set)

[PATCH 02/35] kvm: convert kvm_ioctl(KVM_CHECK_EXTENSION) to kvm_check_extension()

From: Lai Jiangshan la...@cn.fujitsu.com

Simple cleanup: use the existing helper kvm_check_extension().

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c |2 +-
 target-i386/kvm.c |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index cae24bb..35fc73c 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -678,7 +678,7 @@ int kvm_init(int smp_cpus)
 
 s-broken_set_mem_region = 1;
 #ifdef KVM_CAP_JOIN_MEMORY_REGIONS_WORKS
-ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
+ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
 if (ret  0) {
 s-broken_set_mem_region = 0;
 }
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 755f8c9..4004de7 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -187,7 +187,7 @@ static int kvm_get_mce_cap_supported(KVMState *s, uint64_t 
*mce_cap,
 {
 int r;
 
-r = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_MCE);
+r = kvm_check_extension(s, KVM_CAP_MCE);
 if (r  0) {
 *max_banks = r;
 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
@@ -540,7 +540,7 @@ int kvm_arch_init(KVMState *s, int smp_cpus)
  * versions of KVM just assumed that it would be at the end of physical
  * memory but that doesn't work with more than 4GB of memory.  We simply
  * refuse to work with those older versions of KVM. */
-ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_SET_TSS_ADDR);
+ret = kvm_check_extension(s, KVM_CAP_SET_TSS_ADDR);
 if (ret = 0) {
 fprintf(stderr, kvm does not support KVM_CAP_SET_TSS_ADDR\n);
 return ret;
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 13/35] kvm: Fix coding style violations

From: Jan Kiszka jan.kis...@siemens.com

No functional changes.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 kvm-all.c |  139 ++--
 1 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index 35fc73c..2538283 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -82,10 +82,12 @@ static KVMSlot *kvm_alloc_slot(KVMState *s)
 
 for (i = 0; i  ARRAY_SIZE(s-slots); i++) {
 /* KVM private memory slots */
-if (i = 8  i  12)
+if (i = 8  i  12) {
 continue;
-if (s-slots[i].memory_size == 0)
+}
+if (s-slots[i].memory_size == 0) {
 return s-slots[i];
+}
 }
 
 fprintf(stderr, %s: no free slot available\n, __func__);
@@ -220,9 +222,10 @@ int kvm_init_vcpu(CPUState *env)
 }
 
 #ifdef KVM_CAP_COALESCED_MMIO
-if (s-coalesced_mmio  !s-coalesced_mmio_ring)
-s-coalesced_mmio_ring = (void *) env-kvm_run +
-   s-coalesced_mmio * PAGE_SIZE;
+if (s-coalesced_mmio  !s-coalesced_mmio_ring) {
+s-coalesced_mmio_ring =
+(void *)env-kvm_run + s-coalesced_mmio * PAGE_SIZE;
+}
 #endif
 
 ret = kvm_arch_init_vcpu(env);
@@ -269,16 +272,14 @@ static int kvm_dirty_pages_log_change(target_phys_addr_t 
phys_addr,
 
 int kvm_log_start(target_phys_addr_t phys_addr, ram_addr_t size)
 {
-return kvm_dirty_pages_log_change(phys_addr, size,
-  KVM_MEM_LOG_DIRTY_PAGES,
-  KVM_MEM_LOG_DIRTY_PAGES);
+return kvm_dirty_pages_log_change(phys_addr, size, KVM_MEM_LOG_DIRTY_PAGES,
+  KVM_MEM_LOG_DIRTY_PAGES);
 }
 
 int kvm_log_stop(target_phys_addr_t phys_addr, ram_addr_t size)
 {
-return kvm_dirty_pages_log_change(phys_addr, size,
-  0,
-  KVM_MEM_LOG_DIRTY_PAGES);
+return kvm_dirty_pages_log_change(phys_addr, size, 0,
+  KVM_MEM_LOG_DIRTY_PAGES);
 }
 
 static int kvm_set_migration_log(int enable)
@@ -350,7 +351,7 @@ static int kvm_get_dirty_pages_log_range(unsigned long 
start_addr,
  * @end_addr: end of logged region.
  */
 static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
- target_phys_addr_t end_addr)
+  target_phys_addr_t end_addr)
 {
 KVMState *s = kvm_state;
 unsigned long size, allocated_size = 0;
@@ -441,9 +442,8 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
 return ret;
 }
 
-static void kvm_set_phys_mem(target_phys_addr_t start_addr,
-ram_addr_t size,
-ram_addr_t phys_offset)
+static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
+ ram_addr_t phys_offset)
 {
 KVMState *s = kvm_state;
 ram_addr_t flags = phys_offset  ~TARGET_PAGE_MASK;
@@ -550,13 +550,13 @@ static void kvm_set_phys_mem(target_phys_addr_t 
start_addr,
 }
 
 /* in case the KVM bug workaround already consumed the new slot */
-if (!size)
+if (!size) {
 return;
-
+}
 /* KVM does not need to know about this memory */
-if (flags = IO_MEM_UNASSIGNED)
+if (flags = IO_MEM_UNASSIGNED) {
 return;
-
+}
 mem = kvm_alloc_slot(s);
 mem-memory_size = size;
 mem-start_addr = start_addr;
@@ -572,30 +572,29 @@ static void kvm_set_phys_mem(target_phys_addr_t 
start_addr,
 }
 
 static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
- target_phys_addr_t start_addr,
- ram_addr_t size,
- ram_addr_t phys_offset)
+  target_phys_addr_t start_addr,
+  ram_addr_t size, ram_addr_t phys_offset)
 {
-   kvm_set_phys_mem(start_addr, size, phys_offset);
+kvm_set_phys_mem(start_addr, size, phys_offset);
 }
 
 static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
-   target_phys_addr_t start_addr,
-   target_phys_addr_t end_addr)
+target_phys_addr_t start_addr,
+target_phys_addr_t end_addr)
 {
-   return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
+return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
 }
 
 static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
-   int enable)
+int enable)
 {
-   return kvm_set_migration_log(enable);
+return kvm_set_migration_log(enable);
 }
 
 static CPUPhysMemoryClient

[PATCH 31/35] kvm: x86: Rework identity map and TSS setup for larger BIOS sizes

From: Jan Kiszka jan.kis...@siemens.com

In order to support loading BIOSes > 256K, reorder the code, adjusting
the base if the kernel supports moving the identity map.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |   63 +---
 1 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 58122d9..50d8ec8 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -578,27 +578,9 @@ static int kvm_get_supported_msrs(void)
 return ret;
 }
 
-static int kvm_init_identity_map_page(void)
-{
-#ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
-int ret;
-uint64_t addr = 0xfffbc000;
-
-if (!kvm_check_extension(KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
-return 0;
-}
-
-ret = kvm_vm_ioctl(KVM_SET_IDENTITY_MAP_ADDR, addr);
-if (ret  0) {
-fprintf(stderr, kvm_set_identity_map_addr: %s\n, strerror(ret));
-return ret;
-}
-#endif
-return 0;
-}
-
 int kvm_arch_init(void)
 {
+uint64_t identity_base = 0xfffbc000;
 int ret;
 struct utsname utsname;
 
@@ -614,27 +596,42 @@ int kvm_arch_init(void)
 uname(utsname);
 lm_capable_kernel = strcmp(utsname.machine, x86_64) == 0;
 
-/* create vm86 tss.  KVM uses vm86 mode to emulate 16-bit code
- * directly.  In order to use vm86 mode, a TSS is needed.  Since this
- * must be part of guest physical memory, we need to allocate it. */
-
-/* this address is 3 pages before the bios, and the bios should present
- * as unavaible memory.  FIXME, need to ensure the e820 map deals with
- * this?
- */
 /*
- * Tell fw_cfg to notify the BIOS to reserve the range.
+ * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
+ * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
+ * Since these must be part of guest physical memory, we need to allocate
+ * them, both by setting their start addresses in the kernel and by
+ * creating a corresponding e820 entry. We need 4 pages before the BIOS.
+ *
+ * Older KVM versions may not support setting the identity map base. In
+ * that case we need to stick with the default, i.e. a 256K maximum BIOS
+ * size.
  */
-if (e820_add_entry(0xfffbc000, 0x4000, E820_RESERVED)  0) {
-perror(e820_add_entry() table is full);
-exit(1);
+#ifdef KVM_CAP_SET_IDENTITY_MAP_ADDR
+if (kvm_check_extension(KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
+/* Allows up to 16M BIOSes. */
+identity_base = 0xfeffc000;
+
+ret = kvm_vm_ioctl(KVM_SET_IDENTITY_MAP_ADDR, identity_base);
+if (ret  0) {
+return ret;
+}
 }
-ret = kvm_vm_ioctl(KVM_SET_TSS_ADDR, 0xfffbd000);
+#endif
+/* Set TSS base one page after EPT identity map. */
+ret = kvm_vm_ioctl(KVM_SET_TSS_ADDR, identity_base + 0x1000);
+if (ret  0) {
+return ret;
+}
+
+/* Tell fw_cfg to notify the BIOS to reserve the range. */
+ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
 if (ret  0) {
+fprintf(stderr, e820_add_entry() table is full\n);
 return ret;
 }
 
-return kvm_init_identity_map_page();
+return 0;
 }
 
 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 28/35] kvm: x86: Introduce kvmclock device to save/restore its state

From: Jan Kiszka jan.kis...@siemens.com

If kvmclock is used, which implies the kernel supports it, register a
kvmclock device with the sysbus. Its main purpose is to save and restore
the kernel state on migration, but this will also allow visualizing it
one day.

Signed-off-by: Jan Kiszka jan.kis...@siemens.com
CC: Glauber Costa glom...@redhat.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |   92 -
 1 files changed, 91 insertions(+), 1 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 69b8234..47cb22b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -29,6 +29,7 @@
 #include hw/apic.h
 #include ioport.h
 #include kvm_x86.h
+#include hw/sysbus.h
 
 #ifdef CONFIG_KVM_PARA
 #include linux/kvm_para.h
@@ -309,6 +310,85 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t 
status,
 #endif
 }
 
+#if defined(CONFIG_KVM_PARA)  defined(KVM_CAP_ADJUST_CLOCK)
+typedef struct KVMClockState {
+SysBusDevice busdev;
+uint64_t clock;
+bool clock_valid;
+} KVMClockState;
+
+static void kvmclock_pre_save(void *opaque)
+{
+KVMClockState *s = opaque;
+struct kvm_clock_data data;
+int ret;
+
+if (s-clock_valid) {
+return;
+}
+ret = kvm_vm_ioctl(KVM_GET_CLOCK, data);
+if (ret  0) {
+fprintf(stderr, KVM_GET_CLOCK failed: %s\n, strerror(ret));
+data.clock = 0;
+}
+s-clock = data.clock;
+/*
+ * If the VM is stopped, declare the clock state valid to avoid re-reading
+ * it on next vmsave (which would return a different value). Will be reset
+ * when the VM is continued.
+ */
+s-clock_valid = !vm_running;
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+KVMClockState *s = opaque;
+struct kvm_clock_data data;
+
+data.clock = s-clock;
+data.flags = 0;
+return kvm_vm_ioctl(KVM_SET_CLOCK, data);
+}
+
+static void kvmclock_vm_state_change(void *opaque, int running, int reason)
+{
+KVMClockState *s = opaque;
+
+if (running) {
+s-clock_valid = false;
+}
+}
+
+static int kvmclock_init(SysBusDevice *dev)
+{
+KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
+
+qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
+return 0;
+}
+
+static const VMStateDescription kvmclock_vmsd= {
+.name = kvmclock,
+.version_id = 1,
+.minimum_version_id = 1,
+.minimum_version_id_old = 1,
+.pre_save = kvmclock_pre_save,
+.post_load = kvmclock_post_load,
+.fields = (VMStateField []) {
+VMSTATE_UINT64(clock, KVMClockState),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static SysBusDeviceInfo kvmclock_info = {
+.qdev.name = kvmclock,
+.qdev.size = sizeof(KVMClockState),
+.qdev.vmsd = kvmclock_vmsd,
+.qdev.no_user = 1,
+.init = kvmclock_init,
+};
+#endif /* CONFIG_KVM_PARA  KVM_CAP_ADJUST_CLOCK */
+
 int kvm_arch_init_vcpu(CPUState *env)
 {
 struct {
@@ -335,7 +415,6 @@ int kvm_arch_init_vcpu(CPUState *env)
 env-cpuid_svm_features  = kvm_x86_get_supported_cpuid(0x800A,
 0, R_EDX);
 
-
 cpuid_i = 0;
 
 #ifdef CONFIG_KVM_PARA
@@ -442,6 +521,13 @@ int kvm_arch_init_vcpu(CPUState *env)
 }
 #endif
 
+#if defined(CONFIG_KVM_PARA)  defined(KVM_CAP_ADJUST_CLOCK)
+if (cpu_is_bsp(env) 
+(env-cpuid_kvm_features  (1ULL  KVM_FEATURE_CLOCKSOURCE))) {
+sysbus_create_simple(kvmclock, -1, NULL);
+}
+#endif
+
 return kvm_vcpu_ioctl(env, KVM_SET_CPUID2, cpuid_data);
 }
 
@@ -531,6 +617,10 @@ int kvm_arch_init(int smp_cpus)
 int ret;
 struct utsname utsname;
 
+#if defined(CONFIG_KVM_PARA)  defined(KVM_CAP_ADJUST_CLOCK)
+sysbus_register_withprop(kvmclock_info);
+#endif
+
 ret = kvm_get_supported_msrs();
 if (ret  0) {
 return ret;
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 08/35] kvm: introduce kvm_inject_x86_mce_on

From: Jin Dongming jin.dongm...@np.css.fujitsu.com

Pass a table instead of multiple args.

Note:

kvm_inject_x86_mce(env, bank, status, mcg_status, addr, misc,
   abort_on_error);

is equal to:

struct kvm_x86_mce mce = {
.bank = bank,
.status = status,
.mcg_status = mcg_status,
.addr = addr,
.misc = misc,
};
kvm_inject_x86_mce_on(env, &mce, abort_on_error);

Signed-off-by: Hidetoshi Seto seto.hideto...@jp.fujitsu.com
Signed-off-by: Jin Dongming jin.dongm...@np.css.fujitsu.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
---
 target-i386/kvm.c |   57 +---
 1 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ce01e18..9a4bf98 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -263,6 +263,23 @@ static void kvm_do_inject_x86_mce(void *_data)
 }
 }
 
+static void kvm_inject_x86_mce_on(CPUState *env, struct kvm_x86_mce *mce,
+  int flag)
+{
+struct kvm_x86_mce_data data = {
+.env = env,
+.mce = mce,
+.abort_on_error = (flag  ABORT_ON_ERROR),
+};
+
+if (!env-mcg_cap) {
+fprintf(stderr, MCE support is not enabled!\n);
+return;
+}
+
+run_on_cpu(env, kvm_do_inject_x86_mce, data);
+}
+
 static void kvm_mce_broadcast_rest(CPUState *env);
 #endif
 
@@ -278,21 +295,12 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, 
uint64_t status,
 .addr = addr,
 .misc = misc,
 };
-struct kvm_x86_mce_data data = {
-.env = cenv,
-.mce = mce,
-};
-
-if (!cenv-mcg_cap) {
-fprintf(stderr, MCE support is not enabled!\n);
-return;
-}
 
 if (flag  MCE_BROADCAST) {
 kvm_mce_broadcast_rest(cenv);
 }
 
-run_on_cpu(cenv, kvm_do_inject_x86_mce, data);
+kvm_inject_x86_mce_on(cenv, mce, flag);
 #else
 if (flag  ABORT_ON_ERROR) {
 abort();
@@ -1708,6 +1716,13 @@ static void hardware_memory_error(void)
 #ifdef KVM_CAP_MCE
 static void kvm_mce_broadcast_rest(CPUState *env)
 {
+struct kvm_x86_mce mce = {
+.bank = 1,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV,
+.addr = 0,
+.misc = 0,
+};
 CPUState *cenv;
 
 /* Broadcast MCA signal for processor version 06H_EH and above */
@@ -1716,9 +1731,7 @@ static void kvm_mce_broadcast_rest(CPUState *env)
 if (cenv == env) {
 continue;
 }
-kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
-   MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0,
-   ABORT_ON_ERROR);
+kvm_inject_x86_mce_on(cenv, mce, ABORT_ON_ERROR);
 }
 }
 }
@@ -1767,15 +1780,17 @@ static void kvm_mce_inj_srao_memscrub(CPUState *env, 
target_phys_addr_t paddr)
 
 static void kvm_mce_inj_srao_memscrub2(CPUState *env, target_phys_addr_t paddr)
 {
-uint64_t status;
-
-status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
-| MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
-| 0xc0;
-kvm_inject_x86_mce(env, 9, status,
-   MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
-   (MCM_ADDR_PHYS  6) | 0xc, ABORT_ON_ERROR);
+struct kvm_x86_mce mce = {
+.bank = 9,
+.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+  | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+  | 0xc0,
+.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV,
+.addr = paddr,
+.misc = (MCM_ADDR_PHYS  6) | 0xc,
+};
 
+kvm_inject_x86_mce_on(env, mce, ABORT_ON_ERROR);
 kvm_mce_broadcast_rest(env);
 }
 
-- 
1.7.2.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH 35/35] kvm: x86: Only read/write MSR_KVM_ASYNC_PF_EN if supported