Re: [PATCH] always report x2apic as supported feature

2009-07-16 Thread Gleb Natapov
On Thu, Jul 16, 2009 at 09:46:21AM +0800, Sheng Yang wrote:
 On Thursday 16 July 2009 07:01:30 Marcelo Tosatti wrote:
  On Sun, Jul 12, 2009 at 04:10:55PM +0300, Gleb Natapov wrote:
   We emulate x2apic in software, so host support is not required.
  
   Signed-off-by: Gleb Natapov g...@redhat.com
   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index 00844eb..c256da7 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -1497,6 +1497,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2
   *entry, u32 function, case 1:
 entry-edx = kvm_supported_word0_x86_features;
 entry-ecx = kvm_supported_word4_x86_features;
   + /* we support x2apic emulation even if host does not support
   +it since we emulate x2apic in software */
   + entry-ecx |= F(X2APIC);
 break;
 /* function 2 entries are STATEFUL. That is, repeated cpuid commands
  * may return different values. This forces us to get_cpu() before
   --
 Gleb.
 
  What if you have an older host that does not support emulate x2apic?
 
 Due to interrupt remapping can't be enabled with KVM now, I think older host 
 would just ignore this info... (The new one can work without interrupt 
 remapping with KVM).
 
 By the way, I saw X2APIC in host supported CPUID feature list(1.ecx), which I 
Where have you seen it? If you mean kvm_supported_word4_x86_features
then it is not what is supported by the host, but what is supported by
KVM. Host unsupported bits are dropped from there before reporting to
userspace. That is why this patch what necessary.

 don't think it's very properly. Host x2apic feature have nothing to do with 
 KVM, we do the emulation all the way. I suggest to remove the mask for host, 
 and give a comment that we would emulate all x2apic behaviour here, rather 
 than even if, which I think it's a little misleading.
 
 -- 
 regards
 Yang, Sheng

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 10/10] Change irq routing table to use gsi indexed array.

2009-07-16 Thread Gleb Natapov
On Wed, Jul 15, 2009 at 06:42:05PM -0300, Marcelo Tosatti wrote:
 On Wed, Jul 15, 2009 at 11:52:24PM +0300, Gleb Natapov wrote:
  On Wed, Jul 15, 2009 at 03:18:00PM -0300, Marcelo Tosatti wrote:
   On Tue, Jul 14, 2009 at 05:30:45PM +0300, Gleb Natapov wrote:
Use gsi indexed array instead of scanning all entries on each interrupt
injection. Also maintain back mapping from irqchip/pin to gsi to speedup
interrupt acknowledgment notifications.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 include/linux/kvm_host.h |   11 ++-
 virt/kvm/irq_comm.c  |   62 
-
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index aa64d0d..ae6cbf1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,14 @@ struct kvm_kernel_irq_routing_entry {
} irqchip;
struct msi_msg msi;
};
-   struct list_head link;
+   struct hlist_node link;
+};
+
+struct kvm_irq_routing_table {
+   int chip[3][KVM_IOAPIC_NUM_PINS];
+   struct kvm_kernel_irq_routing_entry *rt_entries;
+   u32 max_gsi;
+   struct hlist_head map[0];
 };
 
 struct kvm {
@@ -165,7 +172,7 @@ struct kvm {
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-   struct kvm_kernel_irq_routing_entry *irq_routing;
+   struct kvm_irq_routing_table *irq_routing;
spinlock_t irq_routing_lock;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index c54a28b..da643d4 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -125,6 +125,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, 
int irq, int level)
struct kvm_kernel_irq_routing_entry *e;
unsigned long *irq_state, sig_level;
int ret = -1;
+   struct kvm_irq_routing_table *irq_rt;
+   struct hlist_node *n;
 
trace_kvm_set_irq(irq, level, irq_source_id);
 
@@ -147,14 +149,13 @@ int kvm_set_irq(struct kvm *kvm, int 
irq_source_id, int irq, int level)
 * writes to the unused one.
 */
rcu_read_lock();
-   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
-   if (e-gsi == irq) {
-   int r = e-set(e, kvm, sig_level);
-   if (r  0)
-   continue;
+   irq_rt = rcu_dereference(kvm-irq_routing);
+   hlist_for_each_entry(e, n, irq_rt-map[irq], link) {
+   int r = e-set(e, kvm, sig_level);
+   if (r  0)
+   continue;
 
-   ret = r + ((ret  0) ? 0 : ret);
-   }
+   ret = r + ((ret  0) ? 0 : ret);
}
rcu_read_unlock();
return ret;
@@ -162,21 +163,16 @@ int kvm_set_irq(struct kvm *kvm, int 
irq_source_id, int irq, int level)
 
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned 
pin)
 {
-   struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
-   unsigned gsi = pin;
+   unsigned gsi;
 
trace_kvm_ack_irq(irqchip, pin);
 
rcu_read_lock();
-   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
-   if (e-irqchip.irqchip == irqchip 
-   e-irqchip.pin == pin) {
-   gsi = e-gsi;
-   break;
-   }
-   }
+   gsi = rcu_dereference(kvm-irq_routing)-chip[irqchip][pin];
+   if (gsi == -1)
+   gsi = pin;
 
hlist_for_each_entry_rcu(kian, n, kvm-irq_ack_notifier_list, 
link)
if (kian-gsi == gsi)
@@ -277,7 +273,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
kfree(kvm-irq_routing);
 }
 
-static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+  struct kvm_kernel_irq_routing_entry *e,
   const struct kvm_irq_routing_entry *ue)
 {
int r = -EINVAL;
@@ -303,6 +300,7 @@ static int setup_routing_entry(struct 
kvm_kernel_irq_routing_entry *e,
}
e-irqchip.irqchip = ue-u.irqchip.irqchip;
e-irqchip.pin = ue-u.irqchip.pin + delta;
+   rt-chip[ue-u.irqchip.irqchip][e-irqchip.pin] = 
ue-gsi;
break;
case 

Re: [PATCH] always report x2apic as supported feature

2009-07-16 Thread Sheng Yang
On Thursday 16 July 2009 14:00:15 Gleb Natapov wrote:
 On Thu, Jul 16, 2009 at 09:46:21AM +0800, Sheng Yang wrote:
  On Thursday 16 July 2009 07:01:30 Marcelo Tosatti wrote:
   On Sun, Jul 12, 2009 at 04:10:55PM +0300, Gleb Natapov wrote:
We emulate x2apic in software, so host support is not required.
   
Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00844eb..c256da7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1497,6 +1497,9 @@ static void do_cpuid_ent(struct
kvm_cpuid_entry2 *entry, u32 function, case 1:
entry-edx = kvm_supported_word0_x86_features;
entry-ecx = kvm_supported_word4_x86_features;
+   /* we support x2apic emulation even if host does not 
support
+  it since we emulate x2apic in software */
+   entry-ecx |= F(X2APIC);
break;
/* function 2 entries are STATEFUL. That is, repeated cpuid
commands * may return different values. This forces us to get_cpu()
before --
Gleb.
  
   What if you have an older host that does not support emulate x2apic?
 
  Due to interrupt remapping can't be enabled with KVM now, I think older
  host would just ignore this info... (The new one can work without
  interrupt remapping with KVM).
 
  By the way, I saw X2APIC in host supported CPUID feature list(1.ecx),
  which I

 Where have you seen it? If you mean kvm_supported_word4_x86_features
 then it is not what is supported by the host, but what is supported by
 KVM. Host unsupported bits are dropped from there before reporting to
 userspace. That is why this patch what necessary.

Yes, that's what I mean. x2apic feature needn't judged by host feature, we can 
always set the bit to support it, no need for a filter. I think put it in the 
kvm_supported_word4_x86_features is a little misleading means that KVM support 
it through host feature rather than emulation.

Anyway, not a big deal.

-- 
regards
Yang, Sheng


  don't think it's very properly. Host x2apic feature have nothing to do
  with KVM, we do the emulation all the way. I suggest to remove the mask
  for host, and give a comment that we would emulate all x2apic behaviour
  here, rather than even if, which I think it's a little misleading.
 
  --
  regards
  Yang, Sheng

 --
   Gleb.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] always report x2apic as supported feature

2009-07-16 Thread Gleb Natapov
On Thu, Jul 16, 2009 at 02:09:09PM +0800, Sheng Yang wrote:
 On Thursday 16 July 2009 14:00:15 Gleb Natapov wrote:
  On Thu, Jul 16, 2009 at 09:46:21AM +0800, Sheng Yang wrote:
   On Thursday 16 July 2009 07:01:30 Marcelo Tosatti wrote:
On Sun, Jul 12, 2009 at 04:10:55PM +0300, Gleb Natapov wrote:
 We emulate x2apic in software, so host support is not required.

 Signed-off-by: Gleb Natapov g...@redhat.com
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index 00844eb..c256da7 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1497,6 +1497,9 @@ static void do_cpuid_ent(struct
 kvm_cpuid_entry2 *entry, u32 function, case 1:
   entry-edx = kvm_supported_word0_x86_features;
   entry-ecx = kvm_supported_word4_x86_features;
 + /* we support x2apic emulation even if host does not 
 support
 +it since we emulate x2apic in software */
 + entry-ecx |= F(X2APIC);
   break;
   /* function 2 entries are STATEFUL. That is, repeated cpuid
 commands * may return different values. This forces us to get_cpu()
 before --
   Gleb.
   
What if you have an older host that does not support emulate x2apic?
  
   Due to interrupt remapping can't be enabled with KVM now, I think older
   host would just ignore this info... (The new one can work without
   interrupt remapping with KVM).
  
   By the way, I saw X2APIC in host supported CPUID feature list(1.ecx),
   which I
 
  Where have you seen it? If you mean kvm_supported_word4_x86_features
  then it is not what is supported by the host, but what is supported by
  KVM. Host unsupported bits are dropped from there before reporting to
  userspace. That is why this patch what necessary.
 
 Yes, that's what I mean. x2apic feature needn't judged by host feature, we 
 can 
 always set the bit to support it, no need for a filter. I think put it in the 
 kvm_supported_word4_x86_features is a little misleading means that KVM 
 support 
 it through host feature rather than emulation.
 
Yeah, I put it there initially since I misunderstood how things work and
thought that it will be reported to userspace (and usercpace had a bug
that prevented me from discovering the problem).

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: guest gettimeofday behavior

2009-07-16 Thread Eran Rom
Eran Rom eranr at il.ibm.com writes:

  When Host and Guest ran 2.6.27 with kvm-87 (both qemu-kvm and kvm-kmod), the
  problem persisted. Thus, I am looking for a kernel fix that is not
 part of KVM,
  any lead? Am confined to use 2.6.27

 Marcelo Tosatti mtosa...@redhat.com wrote on 16/07/2009 01:19:40: 
 Is it an AMD host with cpufreq?

Intel(R) Xeon(TM) CPU 3.00GHz, CONFIG_CPU_FREQ=y
Guest clock source is kvmclock

Thanks very much,
Eran



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM crashes when using certain USB device

2009-07-16 Thread Jim Paris
Hi G,

  I've continued my attempts to get the HASP dongle working, but with no 
  success:
...
 Good idea. The results from three test runs after that change are in
 the attached files. The third was done while also dumping the USB bus,
 and the output from that dump is also attached.

The gdb output here looks questionable.  Only the second trial seems
to have USB related stuff in the backtrace, so either gdb is wrong or
there's some memory corruption that is causing crashes elsewhere.
Maybe valgrind could help?

You can also add more debugging to the usb code to try to figure out
where things are going wrong.  See the attached patch for some printfs
that might help.

Try again with less memory on the guest, like -m 2048, just to reduce
possible problems with the 32-bit guest and address space.

I didn't see anything obviously wrong with the usbmon dumps you sent,
or the debugging that qemu printed out, but I'm not familiar with this
code.

Even though you're having problems with -no-kvm, I suspect this is an
upstream qemu issue, so you should probably try the qemu list too.

-jim
diff -urN kvm-87/usb-linux.c kvm-87-debug/usb-linux.c
--- kvm-87/usb-linux.c	2009-06-23 09:32:38.0 -0400
+++ kvm-87-debug/usb-linux.c	2009-07-16 03:06:22.0 -0400
@@ -209,16 +209,21 @@
 
 static AsyncURB *async_alloc(void)
 {
-return (AsyncURB *) qemu_mallocz(sizeof(AsyncURB));
+AsyncURB *aurb = (AsyncURB *) qemu_mallocz(sizeof(AsyncURB));
+dprintf(husb: allocated %p\n, aurb);
+return aurb;
 }
 
 static void async_free(AsyncURB *aurb)
 {
+dprintf(husb: freeing %p\n, aurb);
 qemu_free(aurb);
 }
 
 static void async_complete_ctrl(USBHostDevice *s, USBPacket *p)
 {
+dprintf(husb: complete ctrl, host state %d len %d\n, 
+	s-ctrl.state, s-ctrl.len);
 switch(s-ctrl.state) {
 case CTRL_STATE_SETUP:
 if (p-len  s-ctrl.len)
@@ -266,6 +271,7 @@
 aurb, aurb-urb.status, aurb-urb.actual_length);
 
 	if (p) {
+	dprintf(husb: p=%p\n, p);
 switch (aurb-urb.status) {
 case 0:
 p-len = aurb-urb.actual_length;
@@ -280,11 +286,12 @@
 p-len = USB_RET_NAK;
 break;
 }
-
+	dprintf(husb: completing, p-len=%d\n, p-len);
 usb_packet_complete(p);
 	}
 
 async_free(aurb);
+
 }
 }
 


Re: [Qemu-devel] [PATCH] rev3: support colon in filenames

2009-07-16 Thread Jan Kiszka
Anthony Liguori wrote:
 Jan Kiszka wrote:
 We would still have to deal with the fact that so far '\' had no special
 meaning on Windows - except that is was the well-known path separator.
 So redefining its meaning would break a bit...
   
 
 That's the problem.  You will break existing Windows users.
 
 I know this goes against the current momentum in qemu, but overloading
 one option with a bunch of parameters seems absolutely silly to me.

It's surely not perfect in every detail. On the other hand, it's fairly
compact, concentrating device attributes in one place.

Jan



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH] rev3: support colon in filenames

2009-07-16 Thread Jan Kiszka
Anthony Liguori wrote:
 Jamie Lokier wrote:
 So instead of consistency, you like the idea of using different
 quoting rules for the monitor than for command line arguments?
   
 
 Your proposal breaks Windows in a catastrophic way.  It's almost certain
 that all existing front-ends/scripts will stop working after such a change.

Breakage of existing users is surely a no-go, so '\'-escaping remains
taboo for Windows.

 
 Or you have to quote differently on Windows which means you throw
 consistency out the Window.

I'm still not convinced that we actually need that much consistency
here. This is *mostly* about path names, and path names are not directly
portable between Windows and the rest anyway.

Jan



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Ram Pai
On Wed, 2009-07-15 at 18:04 +0300, Blue Swirl wrote:
 On 7/15/09, Ram Pai linux...@us.ibm.com wrote:
  Problem: It is impossible to feed filenames with the character colon because
   qemu interprets such names as a protocol. For example filename scsi:0, is
   interpreted as a protocol by name scsi.
 
   --- a/block/raw-posix.c
   +++ b/block/raw-posix.c
   +static int qemu_open(const char *filename, int flags, ...)
 
   --- a/block/raw-win32.c
   +++ b/block/raw-win32.c
   +fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
 
 I bet this won't compile on win32.

yes. good catch. fix is in the next revision(rev 6). However I do not
have a setup to compile and test changes in win32-raw.c . I will have to
rely on somebody to do the testing. 

RP



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: qcow2 relative paths

2009-07-16 Thread Kevin Wolf
Ram Pai schrieb:
 On Wed, 2009-07-15 at 22:04 +0100, Jamie Lokier wrote:
 What an unhelpful error message...  There isn't even a way to find out
 the backing file path which the tool is looking for.
 
 Ok. i have introduced a message towards the effect, in the next revision
 of the patch.  Hope that will make things a little easier.

Please don't include it here - one patch for one problem. I agree that
this error message isn't exactly helpful, but it must be fixed in a
different patch.

Kevin
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] rev6: support colon in filenames

2009-07-16 Thread Ram Pai
Problem: It is impossible to feed filenames with the character colon because
qemu interprets such names as a protocol. For example filename scsi:0, is
interpreted as a protocol by name scsi.

This patch allows user to espace colon characters. For example the above
filename can now be expressed either as 'scsi\:0' or as file:scsi:0

anything following the file: tag is interpreted verbatin. However if file:
tag is omitted then any colon characters in the string must be escaped using
backslash.

Here are couple of examples:

scsi\:0\:abc is a local file scsi:0:abc
http\://myweb is a local file by name http://myweb
file:scsi:0:abc is a local file scsi:0:abc
file:http://myweb is a local file by name http://myweb

fat:c:\path\to\dir\:floppy\:  is a fat file by name \path\to\dir:floppy:
NOTE:The above example cannot be expressed using the file: protocol.


Changelog w.r.t to iteration 0:
   1) removes flexibility added to nbd semantics  eg -- nbd:\::
   2) introduce the file: protocol to indicate local file

Changelog w.r.t to iteration 1:
   1) generically handles 'file:' protocol in find_protocol
   2) centralizes 'filename' pruning before the call to open().
   3) fixes buffer overflow seen in fill_token()
   4) adheres to codying style
   5) patch against upstream qemu tree

Changelog w.r.t to iteration 2:
   1) really really fixes buffer overflow seen in 
fill_token() (if not, beat me :)
   2) the centralized 'filename' pruning had a side effect with
qcow2 files and other files. Fixed it. _open() is back.

Changelog w.r.t to iteration 3:
   1) support added to raw-win32.c (i do not have the setup to 
test this change. Request help with testing)
   2) ability to espace option-values containing commas using 
backslashes 
eg  file=file:abc,,  can also be expressed as file=file:abc\, 
where 'abc,' is a filename
   3) fixes a bug (reported by Jan Kiszka) w.r.t support for -snapshot
   4) renamed _open() to qemu_open() and removed dependency on PATH_MAX

Changelog w.r.t to iteration 4:
   1) applies to upstream qemu tree

Changelog w.r.t to iteration 5:
   1) fixed a issue with backing_filename for qcow2 files,
reported by Jamie Lokier.
   2) fixed a compile issue with win32-raw.c reported by Blue Swirl.
(I do not have the setup to test win32 changes. 
 Request help with testing)

Signed-off-by: Ram Pai linux...@us.ibm.com

 block.c   |   39 
 block/raw-posix.c |   15 
 block/raw-win32.c |   26 --
 block/vvfat.c |   97 -
 cutils.c  |   46 +++
 qemu-common.h |2 +
 qemu-option.c |8 -
 8 files changed, 196 insertions(+), 37 deletions(-)

diff --git a/block.c b/block.c
index cefbe77..4423c72 100644
--- a/block.c
+++ b/block.c
@@ -225,7 +225,6 @@ static BlockDriver *find_protocol(const char *filename)
 {
 BlockDriver *drv1;
 char protocol[128];
-int len;
 const char *p;
 
 #ifdef _WIN32
@@ -233,14 +232,9 @@ static BlockDriver *find_protocol(const char *filename)
 is_windows_drive_prefix(filename))
 return bdrv_find_format(raw);
 #endif
-p = strchr(filename, ':');
-if (!p)
+p = prune_strcpy(protocol, sizeof(protocol), filename, ':');
+if (*p != ':')
 return bdrv_find_format(raw);
-len = p - filename;
-if (len  sizeof(protocol) - 1)
-len = sizeof(protocol) - 1;
-memcpy(protocol, filename, len);
-protocol[len] = '\0';
 for(drv1 = first_drv; drv1 != NULL; drv1 = drv1-next) {
 if (drv1-protocol_name 
 !strcmp(drv1-protocol_name, protocol))
@@ -331,7 +325,6 @@ int bdrv_open2(BlockDriverState *bs, const char *filename, 
int flags,
 {
 int ret, open_flags;
 char tmp_filename[PATH_MAX];
-char backing_filename[PATH_MAX];
 
 bs-read_only = 0;
 bs-is_temporary = 0;
@@ -343,7 +336,6 @@ int bdrv_open2(BlockDriverState *bs, const char *filename, 
int flags,
 if (flags  BDRV_O_SNAPSHOT) {
 BlockDriverState *bs1;
 int64_t total_size;
-int is_protocol = 0;
 BlockDriver *bdrv_qcow2;
 QEMUOptionParameter *options;
 
@@ -359,25 +351,15 @@ int bdrv_open2(BlockDriverState *bs, const char 
*filename, int flags,
 }
 total_size = bdrv_getlength(bs1)  SECTOR_BITS;
 
-if (bs1-drv  bs1-drv-protocol_name)
-is_protocol = 1;
-
 bdrv_delete(bs1);
 
 get_tmp_filename(tmp_filename, sizeof(tmp_filename));
 
-/* Real path is meaningless for protocols */
-if (is_protocol)
-snprintf(backing_filename, sizeof(backing_filename),
- %s, filename);
-else
-realpath(filename, backing_filename);
-
 bdrv_qcow2 = bdrv_find_format(qcow2);
 options = parse_option_parameters(, 

Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Kevin Wolf
Ram Pai schrieb:
 On Wed, 2009-07-15 at 18:04 +0300, Blue Swirl wrote:
 On 7/15/09, Ram Pai linux...@us.ibm.com wrote:
 Problem: It is impossible to feed filenames with the character colon because
  qemu interprets such names as a protocol. For example filename scsi:0, is
  interpreted as a protocol by name scsi.
  --- a/block/raw-posix.c
  +++ b/block/raw-posix.c
  +static int qemu_open(const char *filename, int flags, ...)
  --- a/block/raw-win32.c
  +++ b/block/raw-win32.c
  +fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
 I bet this won't compile on win32.
 
 yes. good catch. fix is in the next revision(rev 6). However I do not
 have a setup to compile and test changes in win32-raw.c . I will have to
 rely on somebody to do the testing. 

It's not that complicated to set up a mingw cross build environment.
Have you tried that? At least it would help you to catch compile errors.
(And I usually run it in Wine then to check that it's not completely broken)

Kevin
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/10] Move IO APIC to its own lock.

2009-07-16 Thread Gleb Natapov
On Wed, Jul 15, 2009 at 06:38:48PM -0300, Marcelo Tosatti wrote:
 On Wed, Jul 15, 2009 at 11:48:17PM +0300, Gleb Natapov wrote:
+   spin_unlock(ioapic-lock);
+   kvm_notify_acked_irq(ioapic-kvm, KVM_IRQCHIP_IOAPIC, 
i);
+   spin_lock(ioapic-lock);
   
   While traversing the IOAPIC pins you drop the lock and acquire it again,
   which is error prone: what if the redirection table is changed in
   between, for example?
   
  Yes, the ack notifiers is a problematic part. The only thing that
  current ack notifiers do is set_irq_level(0) so this is not the problem
  in practice. I'll see if I can eliminate this dropping of the lock here.
 
 Currently, yes. But take into account that an ack notifier might do 
 other things than set_irq_level(0).
 
For instance? Ack notifier can do whatever it wants if it does not call
back into ioapic. It may call to ioapic only by set_irq_level(0) (which
it does now from device assignment code) or by set_irq_level(1) and this
does not make sense for level triggered interrupts because not calling
set_irq_level(1) will have the same effect.

But I think I found the way to not drop the lock here.

 Say for example an ack notifier that takes the PIC or IOAPIC lock 
 (for whatever reason).
What reason? No one should take PIC or IOAPIC lock outside of PIC or
IOAPIC code. This is layering violation. IOAPIC should be accessed only
through its public interface (set_irq/mmio_read|write/update_eoi)

 
   Also, irq_lock was held during ack and mask notifier callbacks, so they
   (the callbacks) did not have to worry about concurrency.
   
  Is it possible to have more then one ack for the same interrupt line at
  the same time?
 
 Why not? But maybe this is a somewhat stupid point, because you can
 require the notifiers to handle that.
If this is possible (and it looks like it may happen if IOAPIC broadcasts
an interrupt vector to more then one vcpu) then it may be better to push
complexity into an ack notifier to not penalize a common scenario.
But again, if we will not drop the lock around ack notifier call they
will be serialized.

 
   You questioned the purpose of irq_lock earlier, and thats what it is:
   synchronization between pic and ioapic blur at the irq notifiers.
   
  Pic has its own locking and it fails miserably in regards ack notifiers.
  I bet nobody tried device assignment with pic. It will not work.
 
 It fails to take irq_lock on automatic EOI because on guest entry 
 interrupts are disabled (see d5ecfdd25c412df9c0ecf3ab8e066461fd4f69df).
This explains why the code is buggy, but does not fix the bug. There are
two bugs at least with the pic + ack notifiers. The first one is that
irq notifiers are called without irq_lock held and that will trigger
WARN_ON(!mutex_is_locked(kvm-irq_lock)) in kvm_set_irq() in device
assignment case (besides what is the point of a lock if it is not taken
on every code path?). Another one is that you can't postpone call to ack
notifiers in kvm_pic_read_irq() till after pic_update_irq(s). The reason
is that pic_update_irq() will trigger acked interrupt immediately since
ack notifier is the one who lowers irq line in device assignment case
(that is the reason I haven't done the same in ioapic case BTW).

 
  irq_lock is indeed used to synchronization ioapic access, but the way
  it is done requires calling kvm_set_irq() with lock held and this adds
  unnecessary lock on a msi irq injection path.
  
   Using RCU to synchronize ack/mask notifier list walk versus list
   addition is good, but i'm not entirely sure that smaller locks like you
   are proposing is a benefit long term (things become much more tricky).
  Without removing irq_lock RCU is useless since the list walking is always
  done with irq_lock held. I see smaller locks as simplification BTW. The
  thing is tricky now, or so I felt while trying to understand what
  irq_lock actually protects. With smaller locks with names that clearly
  indicates which data structure it protects I feel the code is much
  easy to understand.
 
 What is worrying is if you need to take both PIC and IOAPIC locks for 
 some operation (then have to worry about lock ordering etc).
 
Lets not worry about far fetched theoretical cases especially since you
have the same problem now. What if you'll need to take both pic lock and
irq_lock? Except that this is not so theoretical because we need to
take them both with current code we just don't do it.

   Maybe it is the only way forward (and so you push this complexity to the
   ack/mask notifiers), but i think it can be avoided until its proven that
   there is contention on irq_lock (which is reduced by faster search).
  This is not about contention. This is about existence of the lock in the
  first place. With the patch set there is no lock at msi irq injection
  path at all and this is better than having lock no matter if the lock is
  congested or not. There is still a lock on ioapic path (which MSI 

Re: [Qemu-devel] [PATCH] rev3: support colon in filenames

2009-07-16 Thread Kevin Wolf
Jamie Lokier schrieb:
 Kevin Wolf wrote:
 Can we at least allow \, instead of ,, in parameter parsing, so that the
 backslash has the practical benefit of being a single universal escape
 character?
 
 Is there a good reason why we cannot simply use \char to escape
 _any_ character, in every context where a user-supplied
 string/name/path/file is used?
 
 I'm thinking of consistency here.  Instead of special cases for
 filenames, why not a standard scheme for all the places in command
 lines _and_ the monitor where a name/path/file is needed?

I absolutely agree with your intention here (maybe except Windows,
haven't thought about that a lot).

But from an implementation POV, this would need a major rework of the
parsing code. The problem is that to do this universally you need to
have one central place where everything is parsed. We currently don't
have that.

We have the command line parser that needs comma and equals for its
parsing. We have the block code that needs the colon for protocols. We
have block drivers that again separate options by colons. And so on.

So currently we can't handle backslashes when parsing command line
options. They would be missing in the block code for escaping colons. We
can't handle all colons in the generic block code, the stripped
backslashes would be missing in vvfat and nbd.

Once we have decided what the solution should look like (including
Windows and other problems), it might be worth the effort. But I can
promise that it's going to be much more than just one patch.

Kevin
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Autotest] [PATCH] Assign an UUID for each VM in kvm command line

2009-07-16 Thread Michael Goldish

- sudhir kumar smalik...@gmail.com wrote:

 On Thu, Jul 16, 2009 at 8:12 AM, Yolkfull Chowyz...@redhat.com
 wrote:
  On 07/15/2009 09:36 PM, Dor Laor wrote:
  On 07/15/2009 12:12 PM, Yolkfull Chow wrote:
  Would submit this patch which is from our internal kvm-autotest
  patches submitted by Jason.
  So that we could go on test case about parameters
 verification(UUID,
  DMI data etc).
 
  Signed-off-by: Yolkfull Chowyz...@redhat.com
  ---
    client/tests/kvm/kvm_vm.py |    4 
    1 files changed, 4 insertions(+), 0 deletions(-)
 
  diff --git a/client/tests/kvm/kvm_vm.py
 b/client/tests/kvm/kvm_vm.py
  index 503f636..68cc235 100644
  --- a/client/tests/kvm/kvm_vm.py
  +++ b/client/tests/kvm/kvm_vm.py
  @@ -287,6 +287,10 @@ class VM:
            elif params.get(display) == nographic:
                qemu_cmd +=  -nographic
 
  +        uuid = os.popen(cat
  /proc/sys/kernel/random/uuid).readline().strip()
  +        if uuid:
  +            qemu_cmd +=  -uuid %s % uuid
 
  If you'll change the uuid on every run, the guest will notice
 that.
  Some guest (M$) might not love it.
  Why not use a static uuid or even just test uuid in a specific
 test
  without having it in all tests?
  Hi Dor, since we cannot use a static uuid for running stress_boot
 test,
  but just assign UUID in a specific test is a good idea. We could use
 an
  option like assign_uuid = yes for that specific test?
 
 This will be far better and more flexible.

Why not allow both static:

uuid = 7032808e-7d90-4921-b95d-fa0e11c9659c

and random:

uuid = random

If uuid == random, VM.create() will generate a random one. Otherwise
it will use the one given.

BTW, the code that generates a random uuid should be in VM.create(),
not in VM.make_qemu_command(). VM.make_qemu_command() should be
deterministic. VM.create() should store the uuid in an attribute
(e.g. self.uuid = ...) and then VM.make_qemu_command() should use this
attribute. This is important because we use VM.make_qemu_command() to
see if the VM needs to be restarted, and if the uuid changes every time
VM.make_qemu_command() is called, the VM will always be restarted,
even if no parameters change.
(When running a test, we normally use the existing VM without killing
and restarting it; we restart only if the qemu command of the new test
is different, or if the user explicitly requested a restart via the
config file.)

  btw: why you're at it, please add uuid to the block devices too.
  + the -smbios option.
  Do you mean assign serial number for block devices?
 
  Thanks for suggestions. :)
 
  Thanks,
  dor
 
  +
            return qemu_cmd
 
 
 
 
 
  --
  Yolkfull
  Regards,
 
  ___
  Autotest mailing list
  autot...@test.kernel.org
  http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
 
 
 
 
 -- 
 Sudhir Kumar
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Autotest] [RFC] KVM-Autotest: remote shell utility for Windows guests

2009-07-16 Thread Michael Goldish

- sudhir kumar smalik...@gmail.com wrote:

 Thats great Michael !!
 
 On Wed, Jul 15, 2009 at 8:56 PM, Michael Goldishmgold...@redhat.com
 wrote:
 
  - Lucas Meneghel Rodrigues l...@redhat.com wrote:
 
  On Wed, Jul 8, 2009 at 4:46 AM, Michael
 Goldishmgold...@redhat.com
  wrote:
   I'm resending the message because it probably got filtered out
 due
  to the
   attached setup.bat file.
  
   The contents of setup.bat were:
  
   copy D:\rss.exe C:\
  
   net user Administrator /active:yes
   net user Administrator 1q2w3eP
   netsh firewall set opmode disable
  
   reg add HKLM\Software\Microsoft\Windows\CurrentVersion\Run /v
  Remote Shell Server /d C:\rss.exe 22 /t REG_SZ /f
   reg add HKLM\Software\Microsoft\Windows
 NT\CurrentVersion\winlogon
  /v AutoAdminLogon /d 1 /t REG_SZ /f
   reg add HKLM\Software\Microsoft\Windows
 NT\CurrentVersion\winlogon
  /v DefaultUserName /d Administrator /t REG_SZ /f
   reg add HKLM\Software\Microsoft\Windows
 NT\CurrentVersion\winlogon
  /v DefaultPassword /d 1q2w3eP /t REG_SZ /f
  
   - Original Message -
  
   Hi,
  
   In an attempt to solve the SSH problems we have with Windows, I
  wrote a little
   remote shell utility to replace the OpenSSH server we're
 currently
  using with
   Win(2000|XP|2003) and the builtin Telnet server we're using with
  Win2008.
   It also works with Vista, for which we currently have no other
  solution.
 
  This is a very welcome addition to our test tools, for sure.
 
  
   Features:
   - Listens on a requested port (22 by default).
   - Provides clients with a cmd.exe shell.
 So does it export the full environment of cmd.exe ? I meant can we
 run
 all the commands here as we can do in a direct cmd.exe in a windows
 system?

Yes, but some commands need special care: GUI commands should be
executed with cmd /c (e.g. cmd /c calc) and some interactive commands
are not fully interactive (like ftp.exe). See comments below (in the
original message).

   - Multiple clients can connect simultaneously.
   - Uses no authentication whatsoever.
   - Uses standard API calls that should work on all modern Windows
  variants.
   - Displays an informative message box if any API call fails.
   - Automatically closes all processes started by a client when it
  disconnects.
   - Allows clients to run GUI programs (see comment below).
   - Starts minimized by default so it doesn't interfere with step
 file
  tests.
   - Reports all activity in a text box.
   - The code is short (450 lines including comments).
 
  I definitely like that :)
 
  
   Tested and verified to work on WinXP 32, 2003 32, Vista 32 and
 64,
  2008 32.
   I haven't tested it on other Windows versions but it should work
  (should be
 I can give a quick testing on some of the datacentres if I can get the
 binaries.
 
   interesting to test it on Windows 7).
  
   The source code is attached.  I used MinGW (with Code::Blocks)
 to
  compile it
   under WinXP.  Link it with ws2_32.lib.  If anyone wants the
 binary
  please let
   me know and I'll send it somehow (I don't know if I'm supposed
 to
  send
   binaries to the list).
 Yes please.
 May be in future the binaries can be hosted somewhere and we can add
 the steps files to do an automatic install of the ssh server on the
 windows guests ?

I'll try sending you a binary in a few minutes.

   Problems:
   - cmd.exe echoes back the command line sent to it.  This means
  commands are
   echoed twice: first by the local terminal and then by the remote
  cmd.exe.
   So if you send ver\r\n to the server you get:
   ver\r\nver\r\nMicrosoft Windows [Version ...]\r\nC:\\
   In order for it to work with Autotest we'll have to make some
  modifications
   to kvm_utils.kvm_spawn (which should be replaced by
 kvm_subprocess
  anyway) --
   specifically disable the local terminal echo, and not assume
 that
  the command
   line is echoed exactly once (it may not be echoed at all by
 Linux
  guests).
  
   - The server does not send or receive files.  For now we can
  transfer files
   into Windows guests using ISOs (-cdrom).  If it turns out to be
  necessary, file
   send/receive support can be implemented into the shell server, or
 we
  can use an
   open source Windows FTP server or find some other solution.
 
  Yes, a remote copy utility would be good, in order to keep
  consistency.
 That too if it can be implemented in the same shell server will be
 good in comparison to relying on multiple tools/utilities.

I think it should be rather easy to implement, but until it's implemented
we can certainly use -cdrom. Currently there are no Windows tests that
involve sending or receiving files, and -cdrom is very easy to use in
Windows because it normally maps to D:\ (except when there are multiple
hard drives). -cdrom is also safer and quicker to use than sending files
because it avoids possible permission problems that need to be fixed
with Windows' cacls (a shell command). So we should probably send/receive
files only when we 

Re: [PATCH] Fix non-KVM build

2009-07-16 Thread Jan Kiszka
Michael S. Tsirkin wrote:
 On Wed, Jun 24, 2009 at 01:13:46PM -0500, Anthony Liguori wrote:
 This introduces some #ifdefs in pcspk to fix the build when KVM isn't 
 enabled.

 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 ---
  hw/pcspk.c |   15 +--
  1 files changed, 9 insertions(+), 6 deletions(-)

 diff --git a/hw/pcspk.c b/hw/pcspk.c
 index 9e1b59a..236995a 100644
 --- a/hw/pcspk.c
 +++ b/hw/pcspk.c
 @@ -80,11 +80,6 @@ static void kvm_set_pit_ch2(PITState *pit,
  kvm_set_pit(kvm_context, inkernel_state);
  }
  }
 -#else
 -static inline void kvm_get_pit_ch2(PITState *pit,
 -   kvm_pit_state *inkernel_state) { }
 -static inline void kvm_set_pit_ch2(PITState *pit,
 -   kvm_pit_state *inkernel_state) { }
  #endif
  
 
 The version with stubs looks cleaner to me. IMO we really should be
 moving away from ifdefs for features, and only use them for
 compiler-specific things.  If for no other reason, then because it
 increases the common code that is compiled for all platforms,
 decreasing the chance that people submit a patch that does not
 build on soe platform.
 
 Is the issue with struct kvm_pit_state?
 Can't we just stub it out as well?
 
 struct kvm_pit_state {};

It's solved like that in current git. Do you still face problems?

Jan

-- 
Siemens AG, Corporate Technology, CT SE 2
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


failure to build kvm release against 2.6.30

2009-07-16 Thread Or Gerlitz
With kvm-88, a configure line such as (see output below)
# ./configure --prefix=/path/to/intall/dir --target-list=x86_64-softmmu  
--kerneldir=/lib/modules/2.6.30/build

yields the below build failure of the kvm kernel sources (I think that in
earlier version there was a configure directive to disable build the kvm
sources and let me use what's in the kernel, it seems to be gone now, why?)

on the same system kvm-88 builds fine against 2.6.29.1, the same problem
happens also with kvm-86 and 87

Or.

[kvm-88]# make
for d in pc-bios/optionrom; do \
make -C $d || exit 1 ; \
done
make[1]: Entering directory
`/home/ogerlitz/kvm/src/kvm-88/pc-bios/optionrom'
make[1]: Nothing to be done for `all'.
make[1]: Leaving directory
`/home/ogerlitz/kvm/src/kvm-88/pc-bios/optionrom'
make -C /lib/modules/2.6.30/build M=`pwd` \
LINUXINCLUDE=-I`pwd`/include -Iinclude \
 -Iarch/x86/include -I`pwd`/include-compat \
-include include/linux/autoconf.h \
-include `pwd`/x86/external-module-compat.h  \
$@
  CC [M]  /home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/svm.o
In file included from include/linux/gfp.h:4,
 from include/linux/kmod.h:22,
 from include/linux/module.h:13,
 from include/linux/sysdev.h:25,
 from include/linux/cpu.h:22,
 from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/../external-module-compat-comm.h:15,
 from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/external-module-compat.h:16,
 from command line:1:
include/linux/mmzone.h:18:26: error: linux/bounds.h: No such file or directory
include/linux/mmzone.h:256:5: warning: MAX_NR_ZONES is not defined
In file included from include/linux/gfp.h:4,
 from include/linux/kmod.h:22,
 from include/linux/module.h:13,
 from include/linux/sysdev.h:25,
 from include/linux/cpu.h:22,
 from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/../external-module-compat-comm.h:15,
 from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/external-module-compat.h:16,
 from command line:1:
include/linux/mmzone.h:290: error: ‘MAX_NR_ZONES’ undeclared here (not in a 
function)
In file included from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/../external-module-compat-comm.h:312,
 from 
/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/external-module-compat.h:16,
 from command line:1:
include/linux/mm.h:446:63: warning: NR_PAGEFLAGS is not defined
include/linux/mm.h:494:62: warning: NR_PAGEFLAGS is not defined
make[4]: *** [/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86/svm.o] Error 1
make[3]: *** [/home/ogerlitz/kvm/src/kvm-88/kvm/kernel/x86] Error 2
make[2]: *** [_module_/home/ogerlitz/kvm/src/kvm-88/kvm/kernel] Error 2
make[1]: *** [all] Error 2
make: *** [kvm-kmod] Error 2

# ./configure --prefix=/path/to/intall/dir --target-list=x86_64-softmmu  
--kerneldir=/lib/modules/2.6.30/build


Install prefix/home/ogerlitz/kvm/install/kvm-88
BIOS directory/home/ogerlitz/kvm/install/kvm-88/share/qemu
binary directory  /home/ogerlitz/kvm/install/kvm-88/bin
Manual directory  /home/ogerlitz/kvm/install/kvm-88/share/man
ELF interp prefix /usr/gnemul/qemu-%M
Source path   /home/ogerlitz/kvm/src/kvm-88
C compilergcc
Host C compiler   gcc
ARCH_CFLAGS   -m64
make  make
install   install
host CPU  x86_64
host big endian   no
target list   x86_64-softmmu
tcg debug enabled no
gprof enabled no
sparse enabledno
strip binariesyes
profiler  no
static build  no
-Werror enabled   no
SDL support   yes
SDL static link   yes
curses supportyes
curl support  yes
mingw32 support   no
Audio drivers oss
Extra audio cards ac97 es1370 sb16
Mixer emulation   no
VNC TLS support   yes
TLS CFLAGS
TLS LIBS  -lgnutls
VNC SASL support  yes
SASL CFLAGS
SASL LIBS  -lsasl2
kqemu support no
xen support   no
CPU emulation yes
brlapi supportno
Documentation yes
NPTL support  no
vde support   no
AIO support   yes
IO thread no
Install blobs yes
KVM support   yes
KVM trace support no
fdt support   no
preadv supportno


Re: Fix migration issue when the destination is loaded

2009-07-16 Thread Mark McLoughlin
Hi Dor,

On Wed, 2009-07-15 at 18:03 +0300, Dor Laor wrote:
 If the migration socket is full, we get EAGAIN for the write.
 The set_fd_handler2 defers the write for later on. The function
 tries to wake up the iothread by qemu_kvm_notify_work.
 Since this happens in a loop, multiple times, the pipe that emulates
 eventfd becomes full and we get a deadlock.

I'm not sure I follow:

  - You're seeing qemu_kvm_notify_work() being called many times

  - The call chain is migrate_fd_put_buffer(), qemu_set_fd_handler2(), 
main_loop_break()

  - This happens when write() in migrate_fd_put_buffer() returns EAGAIN 
because the socket buffer has filled up

Correct?

That sounds like migrate_fd_put_buffer() is being called repeatedly
while we know the socket isn't writable?

Shouldn't the buffered file could stop attempting to call put_buffer()
until it has been notified that the underlying fd is writable?

Cheers,
Mark.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Amit Shah
On (Wed) Jul 15 2009 [10:40:37], Anthony Liguori wrote:
 Blue Swirl wrote:
 Then how about something like:
  -drive name=hda,if=ide,cache=off,file_is_arg -filearg foo.img
  -drive name=vda,if=virtio,cache=writeback,file_comes_next  -patharg  foo.img
  -drive name=sdb,if=scsi,unit=1,fnarg -fnarg boo.img
   

 The explicit ordering part seems clunky to me.  How about:

 -drive name=vda,if=virtio -drive.vda.file filename.img

 What's nice about this syntax is it generalizes well.  You could have:

 -drive.vda.if virtio -drive.vda.file filename.img

How would you differentiate between multiple files? For example,
filename1.img should be the boot drive and filename2.img should be the
secondary drive.

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qemu 0.10.50 CAN NOT get ip from dhcp server

2009-07-16 Thread Amit Shah
On (Wed) Jul 15 2009 [16:54:15], John Wong wrote:
 Yes, kvm-88 fixed this problem.

 I also notice another problem, when i use kvm-88(qemu-system-x86_64 with  
 kvm-kmod-2.6.30.1-rc2.tar.gz) installing 64bit OS (window7  debian   
 netbsd),
 the install CD show the error message, say this system is 32bit not  
 64bit and can not continue to install.
 I did try use -cpu qemu64 or -cpu core2duo, but no help.

Did you run 'make install' after compiling kvm-88?

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qemu 0.10.50 CAN NOT get ip from dhcp server

2009-07-16 Thread John Wong

Amit Shah 提到:

On (Wed) Jul 15 2009 [16:54:15], John Wong wrote:
  

Yes, kvm-88 fixed this problem.

I also notice another problem, when i use kvm-88(qemu-system-x86_64 with  
kvm-kmod-2.6.30.1-rc2.tar.gz) installing 64bit OS (window7  debian   
netbsd),
the install CD show the error message, say this system is 32bit not  
64bit and can not continue to install.

I did try use -cpu qemu64 or -cpu core2duo, but no help.



Did you run 'make install' after compiling kvm-88?

Amit
  

Yes, i did it.
./configure --prefix=/opt/kvm-88/ --audio-drv-list=alsa oss
make clean
make
sudo make install

Please help, thank you.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Specify the system UUID for VM

2009-07-16 Thread Yolkfull Chow
On Thu, Jul 16, 2009 at 06:26:46PM +0800, Yolkfull Chow wrote:
 
 Signed-off-by: Yolkfull Chow yz...@redhat.com
 ---
  client/tests/kvm/kvm_vm.py |   11 +++
  1 files changed, 11 insertions(+), 0 deletions(-)
 
 diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
 index 503f636..895049e 100644
 --- a/client/tests/kvm/kvm_vm.py
 +++ b/client/tests/kvm/kvm_vm.py
 @@ -113,6 +113,13 @@ class VM:
  self.qemu_path = qemu_path
  self.image_dir = image_dir
  self.iso_dir = iso_dir
 +
 +if params.get(uuid):
 +if params.get(uuid) == random:
 +uuid = os.popen(cat 
 /proc/sys/kernel/random/uuid).readline()
 +self.uuid = uuid.strip()
 +else:
 +self.uuid = params.get(uuid)

Sorry, forgot to initialize self.uuid.  Will resend the patch. 

  
  
  # Find available monitor filename
 @@ -374,6 +381,10 @@ class VM:
  # Make qemu command
  qemu_command = self.make_qemu_command()
  
 +# Specify the system UUID
 +if self.uuid:
 +qemu_command +=  -uuid %s % self.uuid
 +
  # Is this VM supposed to accept incoming migrations?
  if for_migration:
  # Find available migration port
 -- 
 1.6.2.5
 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Specify the system UUID for VM

2009-07-16 Thread Yolkfull Chow

Signed-off-by: Yolkfull Chow yz...@redhat.com
---
 client/tests/kvm/kvm_vm.py |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
index 503f636..5f81965 100644
--- a/client/tests/kvm/kvm_vm.py
+++ b/client/tests/kvm/kvm_vm.py
@@ -113,6 +113,14 @@ class VM:
 self.qemu_path = qemu_path
 self.image_dir = image_dir
 self.iso_dir = iso_dir
+
+self.uuid = None
+if params.get(uuid):
+if params.get(uuid) == random:
+uuid = os.popen(cat /proc/sys/kernel/random/uuid).readline()
+self.uuid = uuid.strip()
+else:
+self.uuid = params.get(uuid)
 
 
 # Find available monitor filename
@@ -374,6 +382,10 @@ class VM:
 # Make qemu command
 qemu_command = self.make_qemu_command()
 
+# Specify the system UUID
+if self.uuid:
+qemu_command +=  -uuid %s % self.uuid
+
 # Is this VM supposed to accept incoming migrations?
 if for_migration:
 # Find available migration port
-- 
1.6.2.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fix migration issue when the destination is loaded

2009-07-16 Thread Dor Laor

On 07/16/2009 12:39 PM, Mark McLoughlin wrote:

Hi Dor,

On Wed, 2009-07-15 at 18:03 +0300, Dor Laor wrote:

If the migration socket is full, we get EAGAIN for the write.
The set_fd_handler2 defers the write for later on. The function
tries to wake up the iothread by qemu_kvm_notify_work.
Since this happens in a loop, multiple times, the pipe that emulates
eventfd becomes full and we get a deadlock.


I'm not sure I follow:

   - You're seeing qemu_kvm_notify_work() being called many times

   - The call chain is migrate_fd_put_buffer(), qemu_set_fd_handler2(),
 main_loop_break()

   - This happens when write() in migrate_fd_put_buffer() returns EAGAIN
 because the socket buffer has filled up

Correct?

That sounds like migrate_fd_put_buffer() is being called repeatedly
while we know the socket isn't writable?

Shouldn't the buffered file could stop attempting to call put_buffer()
until it has been notified that the underlying fd is writable?


There are two fds here:
The one returning EAGAIN, is the migration socket. That's why 
migrate_fd_put_buffer is called and a call back is rescheduled by 
qemu_set_fd_handler2.


In the procedure of qemu_set_fd_handler2, the main_loop_break is called. 
It needs to notify the iothread. It does is by qemu_eventfd, since it is 
being emulated on older kernels, we use a pipe.


The pipe fd is the one that blocks.
I though of setting it to non-blocking but we might get partial writes 
with a non blocking fd (in theory) too.





Cheers,
Mark.



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qemu 0.10.50 CAN NOT get ip from dhcp server

2009-07-16 Thread Amit Shah
On (Thu) Jul 16 2009 [19:44:42], John Wong wrote:
 Amit Shah 提到:
 On (Wed) Jul 15 2009 [16:54:15], John Wong wrote:
   
 Yes, kvm-88 fixed this problem.

 I also notice another problem, when i use kvm-88(qemu-system-x86_64 
 with  kvm-kmod-2.6.30.1-rc2.tar.gz) installing 64bit OS (window7  
 debian   netbsd),
 the install CD show the error message, say this system is 32bit not   
 64bit and can not continue to install.
 I did try use -cpu qemu64 or -cpu core2duo, but no help.
 

 Did you run 'make install' after compiling kvm-88?

  Amit
   
 Yes, i did it.
 ./configure --prefix=/opt/kvm-88/ --audio-drv-list=alsa oss
 make clean
 make
 sudo make install

 Please help, thank you.

Any messages in the host kernel's logs? Also can you try running a
32-bit VM and report what /proc/cpuinfo shows? I'm assuming you're
running on a 64 bit host. Can you confirm that?


Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: slow guest performance with build load, looking for ideas

2009-07-16 Thread Jes Sorensen

On 07/12/2009 10:38 AM, Avi Kivity wrote:

On 07/09/2009 09:01 PM, Erik Jacobson wrote:

Test runs after make clean...
time (make -j12 make -j12 modules)

real 10m25.585s
user 26m36.450s
sys 8m14.776s

2nd trial (make clean followed by the same test again.
real 9m21.626s
user 26m42.144s
sys 8m14.532s


That's a scaling of 3.7, still pretty far from the host and even farther
than my results.

Is the numa factor of this machine larger than usual?


I didn't see a reply to this one, so I will just add what I know.

I believe Erik ran the tests on what we sell as an XE270 system. It's
really just a standard Intel or Supermicro motherboard in a box that
has been painted purple (or blue/green now I guess), so it really
shouldn't have extra numa factors compared to other Nehalem systems.

Cheers,
Jes

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Michael S. Tsirkin
On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
 On Wed, Jul 15, 2009 at 11:13:40PM +0300, Michael S. Tsirkin wrote:
  This adds a generic uio driver that can bind to any PCI device.  First
  user will be virtualization where a qemu userspace process needs to give
  guest OS access to the device.
  
  Interrupts are handled using the Interrupt Disable bit in the PCI command
  register and Interrupt Status bit in the PCI status register.  All devices
  compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices 
  should
  support these bits.  Driver detects this support, and won't bind to devices
  which do not support the Interrupt Disable Bit in the command register.
  
  It's expected that more features of interest to virtualization will be
  added to this driver in the future. Possibilities are: mmap for device
  resources, MSI/MSI-X, eventfd (to interface with kvm), iommu.
  
  Acked-by: Chris Wright chr...@redhat.com
  Signed-off-by: Michael S. Tsirkin m...@redhat.com
  ---
  
  Hans, Greg, please review and consider for upstream.
  
  This is intended to solve the problem in virtualization that shared
  interrupts do not work with assigned devices. Earlier versions of this
  patch have circulated on k...@vger.
 
 How does this play with the pci-stub driver that I thought was written
 to solve this very problem?


AFAIK the problem pci stub was written to solve is simply to bind to a
device. You then have to use another kernel module which looks the
device up with something like pci_get_bus_and_slot to do anything
useful. In particular, for non-shared interrupts, we can disable the
interrupt in the apic. But this does not work well for shared
interrupts. Thus this work.

The uio driver will be used in virtualization scenarious, a couple
of possible ones that have been mentioned on the kvm list are:
- device assignment (guest access to device) for simple devices with
  shared interrupts: emulating PCI is tricky enough to better be done in
  userspace. shared interrupt support is important as it happens
  with real devices
- simple communication between guest and host:
  we create a virtual device in host, and userspace
  driver in guest gets events and passes them on
  to e.g. dbus. shared interrupt support is important
  to avoid wasting irqs


  Will it conflict?

No in a sense that you can't bind both drivers to the same device.

 In fact, it looks like you copied the comments for this driver directly
 from the pci-stub driver :)

Right.

 How about moving that documentation into a place that people will notice
 it, like the rest of the UIO documentation?

OK.

 And right now you are just sending the irq to userspace, what is
 userspace supposed to do with it?

Userspace uses libpci (i.e. pci sysfs) to talk to the device and to
re-enable interrupts by writing to the command register.

In the case of device assignment, this will be qemu which
acts as a proxy for driver running in guest context.
In case of uio loaded in guest, the driver will be in guest
userspace, and the device is emulated in qemu.


 Do you have a userspace program that
 uses this interface today to verify that everything works?  If so, care
 to provide a pointer to it?

Sure. I used an emulated device for this.
First, you patch qemu to add the device:
http://www.linux-kvm.org/downloads/mst/test_irq.patch

Now, run with the new kernel (-kernel flag), adding
 -device test-irq

Once in guest, assign the device id
echo 1af4 2009  /sys/bus/pci/drivers/uio_pci_generic/new_id

Compile and run this driver:
http://www.linux-kvm.org/downloads/mst/testuio.c
(it does not use any libraries besides libc,
 so just gcc testuio.c -o testuio)

And now make the device assert interrupts, like this:

while
sleep 1
do
setpci -s 00:04.0 0x40.B=0x1
done

You should see messages printed as the driver gets interrupts, but no
error messages about missed interrupts.

 thanks,
 
 greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Specify the system UUID for VM

2009-07-16 Thread Michael Goldish
I'd do this a little differently, because of the constraints imposed by the 
different purposes of VM.create() and VM.make_qemu_command().

- Yolkfull Chow yz...@redhat.com wrote:

 Signed-off-by: Yolkfull Chow yz...@redhat.com
 ---
  client/tests/kvm/kvm_vm.py |   11 +++
  1 files changed, 11 insertions(+), 0 deletions(-)
 
 diff --git a/client/tests/kvm/kvm_vm.py b/client/tests/kvm/kvm_vm.py
 index 503f636..895049e 100644
 --- a/client/tests/kvm/kvm_vm.py
 +++ b/client/tests/kvm/kvm_vm.py
 @@ -113,6 +113,13 @@ class VM:
  self.qemu_path = qemu_path
  self.image_dir = image_dir
  self.iso_dir = iso_dir
 +
 +if params.get(uuid):
 +if params.get(uuid) == random:
 +uuid = os.popen(cat
 /proc/sys/kernel/random/uuid).readline()
 +self.uuid = uuid.strip()
 +else:
 +self.uuid = params.get(uuid)

1. First, I'd put this code near the code that finds free ports for the
VM, immediately after the code that find a free VNC port, because this
does a similar job (assigns something dynamic to the VM).
This is not important, it's just a matter of code beauty.

2. Then, I'd change the code to something like:

if params.get(uuid) == random:
uuid = os.popen(cat /proc/sys/kernel/random/uuid).readline()
self.random_uuid = uuid.strip()

Or consider reading the file directly:

if params.get(uuid) == random:
file = open(/proc/sys/kernel/random/uuid)
self.random_uuid = file.read().strip()
file.close()

Not much longer, but a little nicer in my opinion.

  
  # Find available monitor filename
 @@ -374,6 +381,10 @@ class VM:
  # Make qemu command
  qemu_command = self.make_qemu_command()
  
 +# Specify the system UUID
 +if self.uuid:
 +qemu_command +=  -uuid %s % self.uuid
 +

3. Then, this code, in my opinion, should be in VM.make_qemu_command(),
not in VM.create(). You can put it at the very end, after handling
display, or wherever you want.
It should also be changed to something like:

if params.get(uuid) == random:
qemu_cmd +=  -uuid %s % self.random_uuid
elif params.get(uuid):
qemu_cmd +=  -uuid %s % params.get(uuid)

4. You should add the following line to VM.__init__():

self.random_uuid = None

(I just realized that there's a little bug in the existing VM code,
so we'll have to set a few more attributes to None in VM.__init__(),
in addition to random_uuid. I'll post a patch for that.)

5. While you're at it, add a function VM.get_uuid(), which should do
something like:

def get_uuid(self):

(docstring)

if self.params.get(uuid) == random:
return self.random_uuid
else:
return self.params.get(uuid)
# (Note that if there is no uuid in self.params,
# this function will return None, which is good)

This will be useful for a little uuid test Yaniv once suggested --
it will make sure the guest reports the correct uuid.

The rationale behind all this:
VM.make_qemu_command() is often called with altered 'params', to see
if those params would lead to a different qemu command.
If a VM is running with a random uuid, and we call VM.make_qemu_command()
with params that specify uuid = ... (not random), we want the qemu
command to reflect that. If a VM is running with a user-specified uuid,
and we want to make it random, we want the qemu command to change.
However, if a VM is running with a random uuid, and we start a new test
with a random uuid, we don't want to restart the VM with a new random uuid --
we'll just use the existing random uuid.
So, if the uuid is random, it should be generated in create(), and used
in make_qemu_command(). If it's user-specified, it should just be used by
make_qemu_command().

I may have made mistakes in the code in this message, so it'll need a
little testing.

Thanks,
Michael

  # Is this VM supposed to accept incoming migrations?
  if for_migration:
  # Find available migration port
 -- 
 1.6.2.5
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qemu 0.10.50 CAN NOT get ip from dhcp server

2009-07-16 Thread John Wong

Amit Shah 提到:

On (Thu) Jul 16 2009 [19:44:42], John Wong wrote:
  

Amit Shah 提到:


On (Wed) Jul 15 2009 [16:54:15], John Wong wrote:
  
  

Yes, kvm-88 fixed this problem.

I also notice another problem, when i use kvm-88(qemu-system-x86_64 
with  kvm-kmod-2.6.30.1-rc2.tar.gz) installing 64bit OS (window7  
debian   netbsd),
the install CD show the error message, say this system is 32bit not   
64bit and can not continue to install.

I did try use -cpu qemu64 or -cpu core2duo, but no help.



Did you run 'make install' after compiling kvm-88?

Amit
  
  

Yes, i did it.
./configure --prefix=/opt/kvm-88/ --audio-drv-list=alsa oss
make clean
make
sudo make install

Please help, thank you.



Any messages in the host kernel's logs? Also can you try running a
32-bit VM and report what /proc/cpuinfo shows? I'm assuming you're
running on a 64 bit host. Can you confirm that?


Amit
  

No, i can not found any message is relate kvm.

Yes, i installed Debian/32-bit VM, below is cat /proc/cpuinfo.

Thank your help.

processor: 0
vendor_id: GenuineIntel
cpu family: 6
model: 2
model name: QEMU Virtual CPU version 0.10.50
stepping: 3
cpu MHz: 2999.638
cache size: 2048 KB
fdiv_bug: no
hlt_bug: no
f00f_bug: no
coma_bug: no
fpu: yes
fpu_exception: yes
cpuid level: 2
wp: yes
flags: fpu de pse tsc msr pae mce cx8 apic mtrr pge mca cmov 
pse36 clflush mmx fxsr sse sse2 pni hypervisor

bogomips: 5999.27
clflush size: 64
power management:

processor: 1
vendor_id: GenuineIntel
cpu family: 6
model: 2
model name: QEMU Virtual CPU version 0.10.50
stepping: 3
cpu MHz: 2999.638
cache size: 2048 KB
fdiv_bug: no
hlt_bug: no
f00f_bug: no
coma_bug: no
fpu: yes
fpu_exception: yes
cpuid level: 2
wp: yes
flags: fpu de pse tsc msr pae mce cx8 apic mtrr pge mca cmov 
pse36 clflush mmx fxsr sse sse2 pni hypervisor

bogomips: 5999.27
clflush size: 64
power management:

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qemu 0.10.50 CAN NOT get ip from dhcp server

2009-07-16 Thread Amit Shah
On (Thu) Jul 16 2009 [20:52:36], John Wong wrote:

 Any messages in the host kernel's logs? Also can you try running a
 32-bit VM and report what /proc/cpuinfo shows? I'm assuming you're
 running on a 64 bit host. Can you confirm that?

 Yes, i installed Debian/32-bit VM, below is cat /proc/cpuinfo.

Is the host a 64-bit host and are you running a 64 bit kernel on the
host?

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Sheng Yang
On Thursday 16 July 2009 20:31:01 Michael S. Tsirkin wrote:
 On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
  On Wed, Jul 15, 2009 at 11:13:40PM +0300, Michael S. Tsirkin wrote:
   This adds a generic uio driver that can bind to any PCI device.  First
   user will be virtualization where a qemu userspace process needs to
   give guest OS access to the device.
  
   Interrupts are handled using the Interrupt Disable bit in the PCI
   command register and Interrupt Status bit in the PCI status register. 
   All devices compliant to PCI 2.3 (circa 2002) and all compliant PCI
   Express devices should support these bits.  Driver detects this
   support, and won't bind to devices which do not support the Interrupt
   Disable Bit in the command register.
  
   It's expected that more features of interest to virtualization will be
   added to this driver in the future. Possibilities are: mmap for device
   resources, MSI/MSI-X, eventfd (to interface with kvm), iommu.
  
   Acked-by: Chris Wright chr...@redhat.com
   Signed-off-by: Michael S. Tsirkin m...@redhat.com
   ---
  
   Hans, Greg, please review and consider for upstream.
  
   This is intended to solve the problem in virtualization that shared
   interrupts do not work with assigned devices. Earlier versions of this
   patch have circulated on k...@vger.
 
  How does this play with the pci-stub driver that I thought was written
  to solve this very problem?

 AFAIK the problem pci stub was written to solve is simply to bind to a
 device. You then have to use another kernel module which looks the
 device up with something like pci_get_bus_and_slot to do anything
 useful. In particular, for non-shared interrupts, we can disable the
 interrupt in the apic. But this does not work well for shared
 interrupts. Thus this work.

 The uio driver will be used in virtualization scenarious, a couple
 of possible ones that have been mentioned on the kvm list are:
 - device assignment (guest access to device) for simple devices with
   shared interrupts: emulating PCI is tricky enough to better be done in
   userspace. shared interrupt support is important as it happens
   with real devices

One comments for shared interrupt: if you means guest device shares interrupt 
with device in other domain(that means guest or host), it's still a security 
hole, and our position seems still won't-do it. Could you explain how the 
situation change with this patch? I am not sure if I understand your meaning 
completely...

Thanks.

-- 
regards
Yang, Sheng

 - simple communication between guest and host:
   we create a virtual device in host, and userspace
   driver in guest gets events and passes them on
   to e.g. dbus. shared interrupt support is important
   to avoid wasting irqs

   Will it conflict?

 No in a sense that you can't bind both drivers to the same device.

  In fact, it looks like you copied the comments for this driver directly
  from the pci-stub driver :)

 Right.

  How about moving that documentation into a place that people will notice
  it, like the rest of the UIO documentation?

 OK.

  And right now you are just sending the irq to userspace, what is
  userspace supposed to do with it?

 Userspace uses libpci (i.e. pci sysfs) to talk to the device and to
 re-enable interrupts by writing to the command register.

 In the case of device assignment, this will be qemu which
 acts as a proxy for driver running in guest context.
 In case of uio loaded in guest, the driver will be in guest
 userspace, and the device is emulated in qemu.

  Do you have a userspace program that
  uses this interface today to verify that everything works?  If so, care
  to provide a pointer to it?

 Sure. I used an emulated device for this.
 First, you patch qemu to add the device:
 http://www.linux-kvm.org/downloads/mst/test_irq.patch

 Now, run with the new kernel (-kernel flag), adding
  -device test-irq

 Once in guest, assign the device id
 echo 1af4 2009  /sys/bus/pci/drivers/uio_pci_generic/new_id

 Compile and run this driver:
 http://www.linux-kvm.org/downloads/mst/testuio.c
 (it does not use any libraries besides libc,
  so just gcc testuio.c -o testuio)

 And now make the device assert interrupts, like this:

 while
 sleep 1
 do
 setpci -s 00:04.0 0x40.B=0x1
 done

 You should see messages printed as the driver gets interrupts, but no
 error messages about missed interrupts.

  thanks,
 
  greg k-h

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Markus Armbruster
Anthony Liguori aligu...@us.ibm.com writes:

 Blue Swirl wrote:
 Then how about something like:
  -drive name=hda,if=ide,cache=off,file_is_arg -filearg foo.img
  -drive name=vda,if=virtio,cache=writeback,file_comes_next  -patharg  foo.img
  -drive name=sdb,if=scsi,unit=1,fnarg -fnarg boo.img
   

 The explicit ordering part seems clunky to me.  How about:

 -drive name=vda,if=virtio -drive.vda.file filename.img

 What's nice about this syntax is it generalizes well.  You could have:

 -drive.vda.if virtio -drive.vda.file filename.img
 -net nic,model=rtl8139,name=foo -net.foo.macaddr 00:11:43:55:44:22

Sanest proposal so far.  Just put filenames in separate arguments, as
with almost every other program.

Instead of name=, let's use id= from Gerd's qdev work.

Why -drive.ID.NAME VALUE, -net.ID.NAME VALUE and so forth, i.e. one
option per object with parameters?  Assuming the ID name space is flat,
a single option suffices.  What about -set ID.NAME=VALUE?

Quoting is problematic.  Not only because it necessarily breaks some
filenames that used to work, also because the shell quotes, too.  I
don't enjoy counting backslashes.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Michael S. Tsirkin
On Thu, Jul 16, 2009 at 09:33:05PM +0800, Sheng Yang wrote:
 On Thursday 16 July 2009 20:31:01 Michael S. Tsirkin wrote:
  On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
   On Wed, Jul 15, 2009 at 11:13:40PM +0300, Michael S. Tsirkin wrote:
This adds a generic uio driver that can bind to any PCI device.  First
user will be virtualization where a qemu userspace process needs to
give guest OS access to the device.
   
Interrupts are handled using the Interrupt Disable bit in the PCI
command register and Interrupt Status bit in the PCI status register. 
All devices compliant to PCI 2.3 (circa 2002) and all compliant PCI
Express devices should support these bits.  Driver detects this
support, and won't bind to devices which do not support the Interrupt
Disable Bit in the command register.
   
It's expected that more features of interest to virtualization will be
added to this driver in the future. Possibilities are: mmap for device
resources, MSI/MSI-X, eventfd (to interface with kvm), iommu.
   
Acked-by: Chris Wright chr...@redhat.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
   
Hans, Greg, please review and consider for upstream.
   
This is intended to solve the problem in virtualization that shared
interrupts do not work with assigned devices. Earlier versions of this
patch have circulated on k...@vger.
  
   How does this play with the pci-stub driver that I thought was written
   to solve this very problem?
 
  AFAIK the problem pci stub was written to solve is simply to bind to a
  device. You then have to use another kernel module which looks the
  device up with something like pci_get_bus_and_slot to do anything
  useful. In particular, for non-shared interrupts, we can disable the
  interrupt in the apic. But this does not work well for shared
  interrupts. Thus this work.
 
  The uio driver will be used in virtualization scenarious, a couple
  of possible ones that have been mentioned on the kvm list are:
  - device assignment (guest access to device) for simple devices with
shared interrupts: emulating PCI is tricky enough to better be done in
userspace. shared interrupt support is important as it happens
with real devices
 
 One comments for shared interrupt: if you means guest device shares interrupt 
 with device in other domain(that means guest or host), it's still a security 
 hole, and our position seems still won't-do it. Could you explain how the 
 situation change with this patch? I am not sure if I understand your meaning 
 completely...
 
 Thanks.

Yes, this lets you safely share an interrupt between guests. Here's how this 
works:
a device asserts interrupt
host (kernel) sets INTD bit in device, wakes up guest
guest handles interrupt and acks it
host (userspace) clears INTD bit in device

As you see, INTD bit is under control of the host, thus guest can not
deny service to other devices sharing the interrupt.

Performance is likely to be lower than with non-shared interrupts,
but that's often the case with interrupt sharing anyway.
-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/11] Move irq ack notifier list to arch independent code.

2009-07-16 Thread Gleb Natapov
Mask irq notifier list is already there.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/ia64/include/asm/kvm_host.h |1 -
 arch/x86/include/asm/kvm_host.h  |1 -
 include/linux/kvm_host.h |1 +
 virt/kvm/irq_comm.c  |4 ++--
 virt/kvm/kvm_main.c  |1 +
 5 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index d9b6325..a362e67 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -475,7 +475,6 @@ struct kvm_arch {
struct list_head assigned_dev_head;
struct iommu_domain *iommu_domain;
int iommu_flags;
-   struct hlist_head irq_ack_notifier_list;
 
unsigned long irq_sources_bitmap;
unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 08732d7..eb723a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -401,7 +401,6 @@ struct kvm_arch{
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
-   struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
 
unsigned int tss_addr;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2490816..b53a5b8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -168,6 +168,7 @@ struct kvm {
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
struct kvm_kernel_irq_routing_entry *irq_routing;
struct hlist_head mask_notifier_list;
+   struct hlist_head irq_ack_notifier_list;
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 0283a2b..cce32de 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -183,7 +183,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
}
rcu_read_unlock();
 
-   hlist_for_each_entry(kian, n, kvm-arch.irq_ack_notifier_list, link)
+   hlist_for_each_entry(kian, n, kvm-irq_ack_notifier_list, link)
if (kian-gsi == gsi)
kian-irq_acked(kian);
 }
@@ -192,7 +192,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
   struct kvm_irq_ack_notifier *kian)
 {
mutex_lock(kvm-irq_lock);
-   hlist_add_head(kian-link, kvm-arch.irq_ack_notifier_list);
+   hlist_add_head(kian-link, kvm-irq_ack_notifier_list);
mutex_unlock(kvm-irq_lock);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 75bf72f..ceaa478 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -946,6 +946,7 @@ static struct kvm *kvm_create_vm(void)
goto out;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
INIT_HLIST_HEAD(kvm-mask_notifier_list);
+   INIT_HLIST_HEAD(kvm-irq_ack_notifier_list);
 #endif
 
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/11] Convert irq notifiers lists to RCU locking.

2009-07-16 Thread Gleb Natapov
Use RCU locking for mask/ack notifiers lists.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 virt/kvm/irq_comm.c |   20 +++-
 1 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index cce32de..6c57e46 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -181,18 +181,18 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
break;
}
}
-   rcu_read_unlock();
 
-   hlist_for_each_entry(kian, n, kvm-irq_ack_notifier_list, link)
+   hlist_for_each_entry_rcu(kian, n, kvm-irq_ack_notifier_list, link)
if (kian-gsi == gsi)
kian-irq_acked(kian);
+   rcu_read_unlock();
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
   struct kvm_irq_ack_notifier *kian)
 {
mutex_lock(kvm-irq_lock);
-   hlist_add_head(kian-link, kvm-irq_ack_notifier_list);
+   hlist_add_head_rcu(kian-link, kvm-irq_ack_notifier_list);
mutex_unlock(kvm-irq_lock);
 }
 
@@ -200,8 +200,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
 {
mutex_lock(kvm-irq_lock);
-   hlist_del_init(kian-link);
+   hlist_del_init_rcu(kian-link);
mutex_unlock(kvm-irq_lock);
+   synchronize_rcu();
 }
 
 int kvm_request_irq_source_id(struct kvm *kvm)
@@ -248,7 +249,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int 
irq,
 {
mutex_lock(kvm-irq_lock);
kimn-irq = irq;
-   hlist_add_head(kimn-link, kvm-mask_notifier_list);
+   hlist_add_head_rcu(kimn-link, kvm-mask_notifier_list);
mutex_unlock(kvm-irq_lock);
 }
 
@@ -256,8 +257,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int 
irq,
  struct kvm_irq_mask_notifier *kimn)
 {
mutex_lock(kvm-irq_lock);
-   hlist_del(kimn-link);
+   hlist_del_rcu(kimn-link);
mutex_unlock(kvm-irq_lock);
+   synchronize_rcu();
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -265,11 +267,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, 
bool mask)
struct kvm_irq_mask_notifier *kimn;
struct hlist_node *n;
 
-   WARN_ON(!mutex_is_locked(kvm-irq_lock));
-
-   hlist_for_each_entry(kimn, n, kvm-mask_notifier_list, link)
+   rcu_read_lock();
+   hlist_for_each_entry_rcu(kimn, n, kvm-mask_notifier_list, link)
if (kimn-irq == irq)
kimn-func(kimn, mask);
+   rcu_read_unlock();
 }
 
 void kvm_free_irq_routing(struct kvm *kvm)
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/11] Unregister ack notifier callback on PIT freeing.

2009-07-16 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/i8254.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 137e548..472653c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -672,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm)
if (kvm-arch.vpit) {
kvm_unregister_irq_mask_notifier(kvm, 0,
   kvm-arch.vpit-mask_notifier);
+   kvm_unregister_irq_ack_notifier(kvm,
+   kvm-arch.vpit-pit_state.irq_ack_notifier);
mutex_lock(kvm-arch.vpit-pit_state.lock);
timer = kvm-arch.vpit-pit_state.pit_timer.timer;
hrtimer_cancel(timer);
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/11] Move irq routing data structure to rcu locking

2009-07-16 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 include/linux/kvm_host.h |2 +-
 virt/kvm/irq_comm.c  |   55 +-
 virt/kvm/kvm_main.c  |1 -
 3 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f244f11..2490816 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -166,7 +166,7 @@ struct kvm {
 
struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-   struct list_head irq_routing; /* of kvm_kernel_irq_routing_entry */
+   struct kvm_kernel_irq_routing_entry *irq_routing;
struct hlist_head mask_notifier_list;
 #endif
 
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 100c267..0283a2b 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -150,7 +150,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int 
irq, int level)
 * IOAPIC.  So set the bit in both. The guest will ignore
 * writes to the unused one.
 */
-   list_for_each_entry(e, kvm-irq_routing, link)
+   rcu_read_lock();
+   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
if (e-gsi == irq) {
int r = e-set(e, kvm, sig_level);
if (r  0)
@@ -158,6 +159,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int 
irq, int level)
 
ret = r + ((ret  0) ? 0 : ret);
}
+   }
+   rcu_read_unlock();
return ret;
 }
 
@@ -170,12 +173,15 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
 
trace_kvm_ack_irq(irqchip, pin);
 
-   list_for_each_entry(e, kvm-irq_routing, link)
+   rcu_read_lock();
+   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
if (e-irqchip.irqchip == irqchip 
e-irqchip.pin == pin) {
gsi = e-gsi;
break;
}
+   }
+   rcu_read_unlock();
 
hlist_for_each_entry(kian, n, kvm-arch.irq_ack_notifier_list, link)
if (kian-gsi == gsi)
@@ -266,19 +272,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, 
bool mask)
kimn-func(kimn, mask);
 }
 
-static void __kvm_free_irq_routing(struct list_head *irq_routing)
-{
-   struct kvm_kernel_irq_routing_entry *e, *n;
-
-   list_for_each_entry_safe(e, n, irq_routing, link)
-   kfree(e);
-}
-
 void kvm_free_irq_routing(struct kvm *kvm)
 {
-   mutex_lock(kvm-irq_lock);
-   __kvm_free_irq_routing(kvm-irq_routing);
-   mutex_unlock(kvm-irq_lock);
+   /* Called only during vm destruction. Nobody can use the pointer
+  at this stage */
+   kfree(kvm-irq_routing);
 }
 
 static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
@@ -328,43 +326,40 @@ int kvm_set_irq_routing(struct kvm *kvm,
unsigned nr,
unsigned flags)
 {
-   struct list_head irq_list = LIST_HEAD_INIT(irq_list);
-   struct list_head tmp = LIST_HEAD_INIT(tmp);
-   struct kvm_kernel_irq_routing_entry *e = NULL;
+   struct kvm_kernel_irq_routing_entry *new, *old;
unsigned i;
int r;
 
+   /* last elemet is left zeored and indicates the end of the array */
+   new = kzalloc(sizeof(*new) * (nr + 1), GFP_KERNEL);
+
+   if (!new)
+   return -ENOMEM;
+
for (i = 0; i  nr; ++i) {
r = -EINVAL;
if (ue-gsi = KVM_MAX_IRQ_ROUTES)
goto out;
if (ue-flags)
goto out;
-   r = -ENOMEM;
-   e = kzalloc(sizeof(*e), GFP_KERNEL);
-   if (!e)
-   goto out;
-   r = setup_routing_entry(e, ue);
+   r = setup_routing_entry(new + i, ue);
if (r)
goto out;
++ue;
-   list_add(e-link, irq_list);
-   e = NULL;
}
 
mutex_lock(kvm-irq_lock);
-   list_splice(kvm-irq_routing, tmp);
-   INIT_LIST_HEAD(kvm-irq_routing);
-   list_splice(irq_list, kvm-irq_routing);
-   INIT_LIST_HEAD(irq_list);
-   list_splice(tmp, irq_list);
+   old = kvm-irq_routing;
+   rcu_assign_pointer(kvm-irq_routing, new);
mutex_unlock(kvm-irq_lock);
 
+   synchronize_rcu();
+
r = 0;
+   new = old;
 
 out:
-   kfree(e);
-   __kvm_free_irq_routing(irq_list);
+   kfree(new);
return r;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7cd1c10..75bf72f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -945,7 +945,6 @@ static struct kvm *kvm_create_vm(void)
if (IS_ERR(kvm))
goto out;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-   INIT_LIST_HEAD(kvm-irq_routing);

[PATCH 00/11] [RFC] make interrupt injection lockless (almost)

2009-07-16 Thread Gleb Natapov
Yeah I decided to change the name of the series. Since fine grained
locking is not the main objective of the series.

kvm-irq_lock protects too much stuff, but still fail to protect
everything it was design to protect (see ack notifiers call in pic). I
want to make IRQ injection fast path as lockless as possible. This patch
series split kvm-irq_lock mutex to smaller spinlocks each one protects
only one thing. Irq routing, irq notifier lists and ioapic gain their own
spinlock.  pic is already uses its own lock. This patch series also makes
interrupt injection to lapic lockless (several kvm_irq_delivery_to_apic()
may run in parallel), but access to lapic was never fully locked in the
first place. VCPU could access lapic in parallel with interrupt injection.
Patch 10 changes irq routing data structure to much more efficient one.
Patch 11 introduce API that allows to send MSI message without going
through irq routing table. This allows us among other thing to limit irq
routing table to a small number of entries.

This version of the patch series fix range checking in patch 10 and ioapic
lock is no longer dropped during irq notifiers run. Patch 11 is a new one.

Gleb Natapov (11):
  Move irq routing data structure to rcu locking
  Unregister ack notifier callback on PIT freeing.
  Move irq ack notifier list to arch independent code.
  Convert irq notifiers lists to RCU locking.
  Protect irq_sources_bitmap by kvm-lock instead of kvm-irq_lock
  Move irq routing to its own locking.
  Move irq notifiers lists to its own locking.
  Move IO APIC to its own lock.
  Drop kvm-irq_lock lock.
  Change irq routing table to use gsi indexed array.
  Introduce MSI message sending interface that bypass IRQ routing.

 arch/ia64/include/asm/kvm_host.h |1 -
 arch/ia64/kvm/kvm-ia64.c |   55 +--
 arch/x86/include/asm/kvm_host.h  |3 +-
 arch/x86/kvm/i8254.c |6 +-
 arch/x86/kvm/i8259.c |2 +-
 arch/x86/kvm/lapic.c |7 +-
 arch/x86/kvm/x86.c   |   58 +---
 include/linux/kvm.h  |   10 ++-
 include/linux/kvm_host.h |   21 +++-
 virt/kvm/eventfd.c   |2 -
 virt/kvm/ioapic.c|   53 +++
 virt/kvm/ioapic.h|4 +-
 virt/kvm/irq_comm.c  |  195 +
 virt/kvm/kvm_main.c  |   11 +-
 14 files changed, 275 insertions(+), 153 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/11] Move irq notifiers lists to its own locking.

2009-07-16 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 include/linux/kvm_host.h |1 +
 virt/kvm/irq_comm.c  |   16 
 virt/kvm/kvm_main.c  |1 +
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8ca15a0..4eb5c85 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -170,6 +170,7 @@ struct kvm {
spinlock_t irq_routing_lock;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
+   spinlock_t irq_notifier_list_lock;
 #endif
 
 #ifdef KVM_ARCH_WANT_MMU_NOTIFIER
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 3dbba41..59c1cde 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -191,17 +191,17 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned 
irqchip, unsigned pin)
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
   struct kvm_irq_ack_notifier *kian)
 {
-   mutex_lock(kvm-irq_lock);
+   spin_lock(kvm-irq_notifier_list_lock);
hlist_add_head_rcu(kian-link, kvm-irq_ack_notifier_list);
-   mutex_unlock(kvm-irq_lock);
+   spin_unlock(kvm-irq_notifier_list_lock);
 }
 
 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
struct kvm_irq_ack_notifier *kian)
 {
-   mutex_lock(kvm-irq_lock);
+   spin_lock(kvm-irq_notifier_list_lock);
hlist_del_init_rcu(kian-link);
-   mutex_unlock(kvm-irq_lock);
+   spin_unlock(kvm-irq_notifier_list_lock);
synchronize_rcu();
 }
 
@@ -247,18 +247,18 @@ void kvm_free_irq_source_id(struct kvm *kvm, int 
irq_source_id)
 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
struct kvm_irq_mask_notifier *kimn)
 {
-   mutex_lock(kvm-irq_lock);
+   spin_lock(kvm-irq_notifier_list_lock);
kimn-irq = irq;
hlist_add_head_rcu(kimn-link, kvm-mask_notifier_list);
-   mutex_unlock(kvm-irq_lock);
+   spin_unlock(kvm-irq_notifier_list_lock);
 }
 
 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
  struct kvm_irq_mask_notifier *kimn)
 {
-   mutex_lock(kvm-irq_lock);
+   spin_lock(kvm-irq_notifier_list_lock);
hlist_del_rcu(kimn-link);
-   mutex_unlock(kvm-irq_lock);
+   spin_unlock(kvm-irq_notifier_list_lock);
synchronize_rcu();
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1d70da3..9553444 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -948,6 +948,7 @@ static struct kvm *kvm_create_vm(void)
spin_lock_init(kvm-irq_routing_lock);
INIT_HLIST_HEAD(kvm-mask_notifier_list);
INIT_HLIST_HEAD(kvm-irq_ack_notifier_list);
+   spin_lock_init(kvm-irq_notifier_list_lock);
 #endif
 
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/11] Drop kvm-irq_lock lock.

2009-07-16 Thread Gleb Natapov
The only thing it protects now is interrupt injection into lapic and
this can work lockless. Even now with kvm-irq_lock in place access
to lapic is not entirely serialized since vcpu access doesn't take
kvm-irq_lock.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/ia64/kvm/kvm-ia64.c |2 --
 arch/x86/kvm/i8254.c |2 --
 arch/x86/kvm/lapic.c |2 --
 arch/x86/kvm/x86.c   |2 --
 include/linux/kvm_host.h |1 -
 virt/kvm/eventfd.c   |2 --
 virt/kvm/irq_comm.c  |6 +-
 virt/kvm/kvm_main.c  |5 +
 8 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index dd7ef2d..8f1fc3a 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -998,10 +998,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
-   mutex_lock(kvm-irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
-   mutex_unlock(kvm-irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, irq_event,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59021da..211f524 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -690,10 +690,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
struct kvm_vcpu *vcpu;
int i;
 
-   mutex_lock(kvm-irq_lock);
kvm_set_irq(kvm, kvm-arch.vpit-irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm-arch.vpit-irq_source_id, 0, 0);
-   mutex_unlock(kvm-irq_lock);
 
/*
 * Provides NMI watchdog support via Virtual Wire mode.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 61ffcfc..0380107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -501,9 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
   irq.vector);
 
-   mutex_lock(apic-vcpu-kvm-irq_lock);
kvm_irq_delivery_to_apic(apic-vcpu-kvm, apic, irq);
-   mutex_unlock(apic-vcpu-kvm-irq_lock);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index de99b84..40adac5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2261,10 +2261,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
if (irqchip_in_kernel(kvm)) {
__s32 status;
-   mutex_lock(kvm-irq_lock);
status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
irq_event.irq, irq_event.level);
-   mutex_unlock(kvm-irq_lock);
if (ioctl == KVM_IRQ_LINE_STATUS) {
irq_event.status = status;
if (copy_to_user(argp, irq_event,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c9824e0..6bcc79f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -164,7 +164,6 @@ struct kvm {
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 #endif
 
-   struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
struct kvm_kernel_irq_routing_entry *irq_routing;
spinlock_t irq_routing_lock;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 99017e8..95954ad 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -61,10 +61,8 @@ irqfd_inject(struct work_struct *work)
struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
struct kvm *kvm = irqfd-kvm;
 
-   mutex_lock(kvm-irq_lock);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
-   mutex_unlock(kvm-irq_lock);
 }
 
 /*
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 7a2b9db..be30124 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -65,8 +65,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct 
kvm_lapic *src,
int i, r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
 
-   WARN_ON(!mutex_is_locked(kvm-irq_lock));
-
if (irq-dest_mode == 0  irq-dest_id == 0xff 
kvm_is_dm_lowest_prio(irq))
printk(KERN_INFO kvm: apic: phys broadcast and lowest prio\n);
@@ -118,7 +116,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry 
*e,
return kvm_irq_delivery_to_apic(kvm, NULL, irq);
 }
 
-/* This should be called with the kvm-irq_lock mutex held
+/*
  * Return value:
  *   0   Interrupt was ignored (masked or not delivered for other reasons)
  *  = 0   Interrupt 

[PATCH 11/11] Introduce MSI message sending interface that bypass IRQ routing.

2009-07-16 Thread Gleb Natapov
Sending of MSI using IRQ routing is an artificial concept and potentially
big number of MSIs (2048 per device) make it also inefficient. This
patch adds an interface to inject MSI messages from userspace to lapic
logic directly. The patch also reduces the maximum number of IRQ routing
entries to 128 since MSIs will no longer go there and 128 entries cover
5 ioapics and this ought to be enough for anybody.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/ia64/kvm/kvm-ia64.c |   26 ++
 arch/x86/kvm/x86.c   |   26 ++
 include/linux/kvm.h  |   10 --
 include/linux/kvm_host.h |3 ++-
 virt/kvm/irq_comm.c  |   23 ++-
 5 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8f1fc3a..c136085 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -195,6 +195,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQCHIP:
case KVM_CAP_MP_STATE:
case KVM_CAP_IRQ_INJECT_STATUS:
+   case KVM_CAP_MSI_MSG:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1010,6 +1011,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
}
+   case KVM_MSI_MSG: {
+   struct kvm_irq_routing_msi msg;
+   struct msi_msg msi;
+
+   r = -EFAULT;
+   if (copy_from_user(msg, argp, sizeof msg))
+   goto out;
+   r = -EINVAL;
+   if (!irqchip_in_kernel(kvm))
+   goto out;
+   if (msg.flags)
+   goto out;
+
+   msi.address_lo = msg.address_lo;
+   msi.address_hi = msg.address_hi;
+   msi.data = msg.data;
+
+   msg.status = kvm_set_msi(kvm, msi);
+   if (copy_to_user(argp, msg, sizeof msg)) {
+   r = -EFAULT;
+   goto out;
+   }
+   r = 0;
+   break;
+   }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 40adac5..a8815f8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1208,6 +1208,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOEVENTFD:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
+   case KVM_CAP_MSI_MSG:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2273,6 +2274,31 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
break;
}
+   case KVM_MSI_MSG: {
+   struct kvm_irq_routing_msi msg;
+   struct msi_msg msi;
+
+   r = -EFAULT;
+   if (copy_from_user(msg, argp, sizeof msg))
+   goto out;
+   r = -EINVAL;
+   if (!irqchip_in_kernel(kvm))
+   goto out;
+   if (msg.flags)
+   goto out;
+
+   msi.address_lo = msg.address_lo;
+   msi.address_hi = msg.address_hi;
+   msi.data = msg.data;
+
+   msg.status = kvm_set_msi(kvm, msi);
+   if (copy_to_user(argp, msg, sizeof msg)) {
+   r = -EFAULT;
+   goto out;
+   }
+   r = 0;
+   break;
+   }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 230a91a..19bc586 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -435,6 +435,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_PIT_STATE2 35
 #endif
 #define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_MSI_MSG 37
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -446,8 +447,11 @@ struct kvm_irq_routing_irqchip {
 struct kvm_irq_routing_msi {
__u32 address_lo;
__u32 address_hi;
-   __u32 data;
-   __u32 pad;
+   union {
+   __u32 data;
+   __s32 status;
+   };
+   __u32 flags;
 };
 
 /* gsi routing entry types */
@@ -591,6 +595,8 @@ struct kvm_irqfd {
 #define KVM_X86_SETUP_MCE _IOW(KVMIO,  0x9c, __u64)
 #define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO,  0x9d, __u64)
 #define KVM_X86_SET_MCE   _IOW(KVMIO,  0x9e, struct kvm_x86_mce)
+#define KVM_MSI_MSG\
+   _IOWR(KVMIO,  0x9f, struct kvm_irq_routing_msi)
 
 /*
  * Deprecated interfaces
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2715e59..f711a7d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -406,6 +406,7 @@ void kvm_get_intr_delivery_bitmask(struct 

[PATCH 05/11] Protect irq_sources_bitmap by kvm-lock instead of kvm-irq_lock

2009-07-16 Thread Gleb Natapov
It is already protected by kvm-lock on device assignment path. Just
take the same lock in the PIT code.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/x86/kvm/i8254.c |2 ++
 virt/kvm/irq_comm.c  |8 
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 472653c..59021da 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -611,7 +611,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (!pit)
return NULL;
 
+   mutex_lock(kvm-lock);
pit-irq_source_id = kvm_request_irq_source_id(kvm);
+   mutex_unlock(kvm-lock);
if (pit-irq_source_id  0) {
kfree(pit);
return NULL;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 6c57e46..ce8fcd3 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -210,7 +210,8 @@ int kvm_request_irq_source_id(struct kvm *kvm)
unsigned long *bitmap = kvm-arch.irq_sources_bitmap;
int irq_source_id;
 
-   mutex_lock(kvm-irq_lock);
+   WARN_ON(!mutex_is_locked(kvm-lock));
+
irq_source_id = find_first_zero_bit(bitmap,
sizeof(kvm-arch.irq_sources_bitmap));
 
@@ -221,7 +222,6 @@ int kvm_request_irq_source_id(struct kvm *kvm)
 
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
set_bit(irq_source_id, bitmap);
-   mutex_unlock(kvm-irq_lock);
 
return irq_source_id;
 }
@@ -230,9 +230,10 @@ void kvm_free_irq_source_id(struct kvm *kvm, int 
irq_source_id)
 {
int i;
 
+   /* during vm destruction this function is called without locking */
+   WARN_ON(!mutex_is_locked(kvm-lock)  atomic_read(kvm-users_count));
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
 
-   mutex_lock(kvm-irq_lock);
if (irq_source_id  0 ||
irq_source_id = sizeof(kvm-arch.irq_sources_bitmap)) {
printk(KERN_ERR kvm: IRQ source ID out of range!\n);
@@ -241,7 +242,6 @@ void kvm_free_irq_source_id(struct kvm *kvm, int 
irq_source_id)
for (i = 0; i  KVM_IOAPIC_NUM_PINS; i++)
clear_bit(irq_source_id, kvm-arch.irq_states[i]);
clear_bit(irq_source_id, kvm-arch.irq_sources_bitmap);
-   mutex_unlock(kvm-irq_lock);
 }
 
 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/11] Change irq routing table to use gsi indexed array.

2009-07-16 Thread Gleb Natapov
Use gsi indexed array instead of scanning all entries on each interrupt
injection. Also maintain back mapping from irqchip/pin to gsi to speedup
interrupt acknowledgment notifications.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 include/linux/kvm_host.h |   11 +++-
 virt/kvm/irq_comm.c  |   57 +
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6bcc79f..2715e59 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -128,7 +128,14 @@ struct kvm_kernel_irq_routing_entry {
} irqchip;
struct msi_msg msi;
};
-   struct list_head link;
+   struct hlist_node link;
+};
+
+struct kvm_irq_routing_table {
+   int chip[3][KVM_IOAPIC_NUM_PINS];
+   struct kvm_kernel_irq_routing_entry *rt_entries;
+   u32 max_gsi;
+   struct hlist_head map[0];
 };
 
 struct kvm {
@@ -165,7 +172,7 @@ struct kvm {
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
-   struct kvm_kernel_irq_routing_entry *irq_routing;
+   struct kvm_irq_routing_table *irq_routing;
spinlock_t irq_routing_lock;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index be30124..ae4 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -128,6 +128,8 @@ static int __kvm_set_irq(struct kvm *kvm, int 
irq_source_id, int irq, int level,
struct kvm_kernel_irq_routing_entry *e;
unsigned long *irq_state, sig_level;
int ret = -1;
+   struct kvm_irq_routing_table *irq_rt;
+   struct hlist_node *n;
 
trace_kvm_set_irq(irq, level, irq_source_id);
 
@@ -150,15 +152,15 @@ static int __kvm_set_irq(struct kvm *kvm, int 
irq_source_id, int irq, int level,
 * writes to the unused one.
 */
rcu_read_lock();
-   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
-   if (e-gsi == irq) {
+   irq_rt = rcu_dereference(kvm-irq_routing);
+   if (irq  irq_rt-max_gsi)
+   hlist_for_each_entry(e, n, irq_rt-map[irq], link) {
int r = e-set(e, kvm, sig_level, notifier);
if (r  0)
continue;
 
ret = r + ((ret  0) ? 0 : ret);
}
-   }
rcu_read_unlock();
return ret;
 }
@@ -175,21 +177,16 @@ int kvm_notifier_set_irq(struct kvm *kvm, int 
irq_source_id, int irq, int level)
 
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
-   struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_ack_notifier *kian;
struct hlist_node *n;
-   unsigned gsi = pin;
+   unsigned gsi;
 
trace_kvm_ack_irq(irqchip, pin);
 
rcu_read_lock();
-   for (e = rcu_dereference(kvm-irq_routing); e  e-set; e++) {
-   if (e-irqchip.irqchip == irqchip 
-   e-irqchip.pin == pin) {
-   gsi = e-gsi;
-   break;
-   }
-   }
+   gsi = rcu_dereference(kvm-irq_routing)-chip[irqchip][pin];
+   if (gsi == -1)
+   gsi = pin;
 
hlist_for_each_entry_rcu(kian, n, kvm-irq_ack_notifier_list, link)
if (kian-gsi == gsi)
@@ -290,7 +287,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
kfree(kvm-irq_routing);
 }
 
-static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+  struct kvm_kernel_irq_routing_entry *e,
   const struct kvm_irq_routing_entry *ue)
 {
int r = -EINVAL;
@@ -316,6 +314,9 @@ static int setup_routing_entry(struct 
kvm_kernel_irq_routing_entry *e,
}
e-irqchip.irqchip = ue-u.irqchip.irqchip;
e-irqchip.pin = ue-u.irqchip.pin + delta;
+   if (e-irqchip.pin  KVM_IOAPIC_NUM_PINS)
+   goto out;
+   rt-chip[ue-u.irqchip.irqchip][e-irqchip.pin] = ue-gsi;
break;
case KVM_IRQ_ROUTING_MSI:
e-set = kvm_set_msi;
@@ -326,6 +327,8 @@ static int setup_routing_entry(struct 
kvm_kernel_irq_routing_entry *e,
default:
goto out;
}
+
+   hlist_add_head(e-link, rt-map[e-gsi]);
r = 0;
 out:
return r;
@@ -337,23 +340,37 @@ int kvm_set_irq_routing(struct kvm *kvm,
unsigned nr,
unsigned flags)
 {
-   struct kvm_kernel_irq_routing_entry *new, *old;
-   unsigned i;
+   struct kvm_irq_routing_table *new, *old;
+   u32 i, j, max_gsi = 0;
int r;
 
-   /* last elemet is left zeored and indicates the end of the array */
-   new = kzalloc(sizeof(*new) * (nr + 1), GFP_KERNEL);
+   for (i = 

[PATCH 06/11] Move irq routing to its own locking.

2009-07-16 Thread Gleb Natapov

Signed-off-by: Gleb Natapov g...@redhat.com
---
 include/linux/kvm_host.h |1 +
 virt/kvm/irq_comm.c  |5 ++---
 virt/kvm/kvm_main.c  |1 +
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b53a5b8..8ca15a0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -167,6 +167,7 @@ struct kvm {
struct mutex irq_lock;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
struct kvm_kernel_irq_routing_entry *irq_routing;
+   spinlock_t irq_routing_lock;
struct hlist_head mask_notifier_list;
struct hlist_head irq_ack_notifier_list;
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index ce8fcd3..3dbba41 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -350,11 +350,10 @@ int kvm_set_irq_routing(struct kvm *kvm,
++ue;
}
 
-   mutex_lock(kvm-irq_lock);
+   spin_lock(kvm-irq_routing_lock);
old = kvm-irq_routing;
rcu_assign_pointer(kvm-irq_routing, new);
-   mutex_unlock(kvm-irq_lock);
-
+   spin_unlock(kvm-irq_routing_lock);
synchronize_rcu();
 
r = 0;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ceaa478..1d70da3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -945,6 +945,7 @@ static struct kvm *kvm_create_vm(void)
if (IS_ERR(kvm))
goto out;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
+   spin_lock_init(kvm-irq_routing_lock);
INIT_HLIST_HEAD(kvm-mask_notifier_list);
INIT_HLIST_HEAD(kvm-irq_ack_notifier_list);
 #endif
-- 
1.6.2.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/11] Move IO APIC to its own lock.

2009-07-16 Thread Gleb Natapov
Introduce new function kvm_notifier_set_irq() that should be used
to change irq line level from irq notifiers. When irq notifier
change irq line level it calls into irq chip code recursively. The
function avoids taking a lock recursively.

Signed-off-by: Gleb Natapov g...@redhat.com
---
 arch/ia64/kvm/kvm-ia64.c|   27 ++-
 arch/x86/include/asm/kvm_host.h |2 +-
 arch/x86/kvm/i8259.c|2 +-
 arch/x86/kvm/lapic.c|5 +---
 arch/x86/kvm/x86.c  |   30 ++---
 include/linux/kvm_host.h|3 +-
 virt/kvm/ioapic.c   |   53 --
 virt/kvm/ioapic.h   |4 ++-
 virt/kvm/irq_comm.c |   25 ++
 virt/kvm/kvm_main.c |2 +-
 10 files changed, 101 insertions(+), 52 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0ad09f0..dd7ef2d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -850,9 +850,16 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
 
r = 0;
switch (chip-chip_id) {
-   case KVM_IRQCHIP_IOAPIC:
-   memcpy(chip-chip.ioapic, ioapic_irqchip(kvm),
-   sizeof(struct kvm_ioapic_state));
+   case KVM_IRQCHIP_IOAPIC: {
+   struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+   if (ioapic) {
+   spin_lock(ioapic-lock);
+   memcpy(chip-chip.ioapic, ioapic,
+  sizeof(struct kvm_ioapic_state));
+   spin_unlock(ioapic-lock);
+   } else
+   r = -EINVAL;
+   }
break;
default:
r = -EINVAL;
@@ -867,10 +874,16 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, 
struct kvm_irqchip *chip)
 
r = 0;
switch (chip-chip_id) {
-   case KVM_IRQCHIP_IOAPIC:
-   memcpy(ioapic_irqchip(kvm),
-   chip-chip.ioapic,
-   sizeof(struct kvm_ioapic_state));
+   case KVM_IRQCHIP_IOAPIC: {
+   struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+   if (ioapic) {
+   spin_lock(ioapic-lock);
+   memcpy(ioapic, chip-chip.ioapic,
+  sizeof(struct kvm_ioapic_state));
+   spin_unlock(ioapic-lock);
+   } else
+   r = -EINVAL;
+   }
break;
default:
r = -EINVAL;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eb723a7..cce2071 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -621,7 +621,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned 
nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
   u32 error_code);
 
-int kvm_pic_set_irq(void *opaque, int irq, int level);
+int kvm_pic_set_irq(void *opaque, int irq, int level, bool notifier);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index daf4606..d9baad7 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -182,7 +182,7 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
 }
 
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(void *opaque, int irq, int level, bool notifier)
 {
struct kvm_pic *s = opaque;
int ret = -1;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e2e2849..61ffcfc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
-   if (!(apic_get_reg(apic, APIC_SPIV)  APIC_SPIV_DIRECTED_EOI)) {
-   mutex_lock(apic-vcpu-kvm-irq_lock);
+   if (!(apic_get_reg(apic, APIC_SPIV)  APIC_SPIV_DIRECTED_EOI))
kvm_ioapic_update_eoi(apic-vcpu-kvm, vector, trigger_mode);
-   mutex_unlock(apic-vcpu-kvm-irq_lock);
-   }
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48567fa..de99b84 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2010,10 +2010,16 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, 
struct kvm_irqchip *chip)
pic_irqchip(kvm)-pics[1],
sizeof(struct kvm_pic_state));
break;
-   case KVM_IRQCHIP_IOAPIC:
-   memcpy(chip-chip.ioapic,
-   ioapic_irqchip(kvm),
-   sizeof(struct kvm_ioapic_state));
+   case KVM_IRQCHIP_IOAPIC: {
+   struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+   if 

Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Michael S. Tsirkin
On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
 How about moving that documentation into a place that people will notice
 it, like the rest of the UIO documentation?

Greg,

would it make more sense to add this to
Documentation/DocBook/uio-howto.xml, or to create
Documentation/uio_pci_generic.txt ?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Anthony Liguori

Markus Armbruster wrote:

Anthony Liguori aligu...@us.ibm.com writes:

  

Blue Swirl wrote:


Then how about something like:
 -drive name=hda,if=ide,cache=off,file_is_arg -filearg foo.img
 -drive name=vda,if=virtio,cache=writeback,file_comes_next  -patharg  foo.img
 -drive name=sdb,if=scsi,unit=1,fnarg -fnarg boo.img
  
  

The explicit ordering part seems clunky to me.  How about:

-drive name=vda,if=virtio -drive.vda.file filename.img

What's nice about this syntax is it generalizes well.  You could have:

-drive.vda.if virtio -drive.vda.file filename.img
-net nic,model=rtl8139,name=foo -net.foo.macaddr 00:11:43:55:44:22



Sanest proposal so far.  Just put filenames in separate arguments, as
with almost every other program.

Instead of name=, let's use id= from Gerd's qdev work.
  


Works for me.


Why -drive.ID.NAME VALUE, -net.ID.NAME VALUE and so forth, i.e. one
option per object with parameters?  Assuming the ID name space is flat,
a single option suffices.  What about -set ID.NAME=VALUE?
  


Looks attractive on the surface.  Feels really difficult to implement :-)


Quoting is problematic.  Not only because it necessarily breaks some
filenames that used to work, also because the shell quotes, too.  I
don't enjoy counting backslashes.
  


Yup.

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


init scripts for KVM guests

2009-07-16 Thread Saman Behnam
Hi there,


I've wrote a script for starting and stopping the KVM guest machines
either by saving and restoring or by graceful shutdown and start. My
question, is there an option within libvirt to make the KVM guest
machines get saved when the KVM host shuts down? I had problems to
gracefully shutdown Windows VM's!!!


Right now I had to write these scripts to save and restore the machines
upon starting and stopping the KVM host.


Many Thanks for your response



vim /etc/init.d/kvm-win0-rcs


###


#!/bin/sh


### BEGIN INIT INFO
# Provides:  kvm-win0-rcs
# Required-Start:$kvm $libvirtd
# Required-Stop: $kvm $libvirtd
# Default-Start: 2 3 4 5
# Default-Stop:  0 1 6
# Short-Description: Starting/Saving, Stopping/Restoring kvm-win0-rcs
KVM guest.
# Description:   Starting/Saving, Stopping/Restoring KVM guest win0.
# kvm-win0-rcs KVM guest.
### END INIT INFO#


# Please copy the init scripts of the KVM VM's in to /etc/init.d and
then run! for i in `cd /etc/init.d  ls kvm-*` ; do update-rc.d  $i
defaults 21 19 ; done

# U need per VM a separate start-stop-script!
VM_NAME=win0 # Name of ur guest VM in the XML config file located in
/etc/libvirt/qemu/.
TIMEOUT=300
VM_SAVE_DIR=/kvm/tmp/vm_save_dir # Directory for storing the guest VM
save data.
DEVNULL=/dev/null
#ACTION=shutdown
ACTION=save # ACTION takes one argument, either save or shutdown.
In the save case it will save the VM and restore it while stopping or
starting. In the shutdown case it will shutdown then VM and start it
while stopping or starting.



vmshutdown () {
if ! virsh list|grep running|grep -v grep|grep $VM_NAME 
$DEVNULL 21; then
echo VM $VM_NAME is not running!!! Exiting.
exit 1
fi
PID=$(ps ax|grep $VM_NAME|grep kvm|cut -c 1-6)
COUNT=0
echo kvmshutdown \: Shutting down $VM_NAME with pid $PID
virsh shutdown $VM_NAME  exit 0
while [ $COUNT -lt $TIMEOUT ]; do
ps --pid $PID  $DEVNULL 21
if [ $? -eq 1 ];  then
exit 0
fi
sleep 5
COUNT=$(($COUNT+5))
done
echo kvmshutdown \: Timeout happend. Destroying VM $VM_NAME
virsh destroy $VM_NAME  exit 1
}


vmstart () {
if virsh list|grep running|grep -v grep|grep $VM_NAME 
$DEVNULL 21; then
echo VM $VM_NAME is allready running!!! Exiting.
exit 1
fi
virsh start $VM_NAME  exit 0
exit 1
}

vmsave () {
if ! virsh list|grep running|grep -v grep|grep $VM_NAME 
$DEVNULL 21; then
echo VM $VM_NAME is not running!!! Exiting.
exit 1
fi
PID=$(ps ax|grep $VM_NAME|grep kvm|cut -c 1-6)
COUNT=0
echo vmsave \: Saving VM $VM_NAME
if [ ! -d $VM_SAVE_DIR ]; then
echo creating  $VM_SAVE_DIR
mkdir -p $VM_SAVE_DIR
fi
virsh save $VM_NAME $VM_SAVE_DIR/$VM_NAME.kvm.save  exit 0
while [ $COUNT -lt $TIMEOUT ]; do
ps --pid $PID  $DEVNULL 21
if [ $? -eq 1 ];  then
exit 0
fi
sleep 5
COUNT=$(($COUNT+5))
done
echo vmsave \: Timeout happend. Destroying VM $VM_NAME
virsh destroy $VM_NAME
exit 1
}

vmrestore () {
if virsh list|grep running|grep -v grep|grep $VM_NAME 
$DEVNULL 21; then
echo VM $VM_NAME is allready running!!! Exiting.
exit 1
fi
if ! ls $VM_SAVE_DIR/$VM_NAME.kvm.save  /dev/null 21 ; then
echo Restore file $VM_NAME.kvm.save in $VM_SAVE_DIR
not found! Starting VM!!!
vmstart
exit 1
fi
cd $VM_SAVE_DIR
echo vmrestore \: Restoring VM $VM_NAME
virsh restore $VM_SAVE_DIR/$VM_NAME.kvm.save  rm -f
$VM_SAVE_DIR/$VM_NAME.kvm.save
exit 0
}


# Here is the start of the main program :-)
case $1 in
start)
case $ACTION in
save)
vmrestore
;;
shutdown)
vmstart
;;
esac
;;
stop)
case $ACTION in
save)
vmsave
;;
shutdown)
vmshutdown
;;
esac
;;

*)
N=$0
echo Usage: $N {start|stop} 2
exit 1
;;
esac
exit 0



Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Hans J. Koch
On Thu, Jul 16, 2009 at 05:07:10PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
  How about moving that documentation into a place that people will notice
  it, like the rest of the UIO documentation?
 
 Greg,
 
 would it make more sense to add this to
 Documentation/DocBook/uio-howto.xml, or to create
 Documentation/uio_pci_generic.txt ?

Hi Michael,
I'd prefer to have it in uio-howto.xml so that people only have to look in
one place. In does not have to be very detailled, just a short explanation
what this driver is all about and a short example how to use it.

Thanks,
Hans

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Gerd Hoffmann

On 07/16/09 15:43, Markus Armbruster wrote:


Why -drive.ID.NAME VALUE, -net.ID.NAME VALUE and so forth, i.e. one
option per object with parameters?  Assuming the ID name space is flat,
a single option suffices.  What about -set ID.NAME=VALUE?


Hmm, I think we will have multiple namespaces.  At least one for guest 
devices and one for host backends.  Probably also different namespaces 
for different kinds of host backends (disk / net / char / ...).


-set or maybe -drive-set id.name= should be easier to handle then 
-drive.id.name with qemu's command line option parser.


cheers,
  Gerd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] rev5: support colon in filenames

2009-07-16 Thread Gerd Hoffmann

On 07/16/09 16:10, Anthony Liguori wrote:

Why -drive.ID.NAME VALUE, -net.ID.NAME VALUE and so forth, i.e. one
option per object with parameters? Assuming the ID name space is flat,
a single option suffices. What about -set ID.NAME=VALUE?


Looks attractive on the surface. Feels really difficult to implement :-)


Shouldn't be that horrible.  Look at QemuOpts in the drive patches v3 ;)

cheers,
  Gerd

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm-390: fix wait_queue handling

2009-07-16 Thread Christian Bornträger
From: Christian Borntraeger borntrae...@de.ibm.com

There are two waitqueues in kvm for wait handling:
vcpu-wq for virt/kvm/kvm_main.c and
vpcu-arch.local_int.wq for the s390 specific wait code.

the wait handling in kvm_s390_handle_wait was broken by using different
wait_queues for add_wait queue and remove_wait_queue.

There are two options to fix the problem: 
o  move all the s390 specific code to vcpu-wq and remove
   vcpu-arch.local_int.wq
o  move all the s390 specific code to vcpu-arch.local_int.wq

This patch chooses the 2nd variant for two reasons:
o  s390 does not use kvm_vcpu_block but implements its own enabled wait
   handling.
   Having a separate wait_queue make it clear, that our wait mechanism is
   different
o  the patch is much smaller

Report-by:  Julia Lawall ju...@diku.dk
Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/kvm/interrupt.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: kvm/arch/s390/kvm/interrupt.c
===
--- kvm.orig/arch/s390/kvm/interrupt.c
+++ kvm/arch/s390/kvm/interrupt.c
@@ -386,7 +386,7 @@ no_timer:
}
__unset_cpu_idle(vcpu);
__set_current_state(TASK_RUNNING);
-   remove_wait_queue(vcpu-wq, wait);
+   remove_wait_queue(vcpu-arch.local_int.wq, wait);
spin_unlock_bh(vcpu-arch.local_int.lock);
spin_unlock(vcpu-arch.local_int.float_int-lock);
hrtimer_try_to_cancel(vcpu-arch.ckc_timer);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM PATCH] xinterface

2009-07-16 Thread Gregory Haskins
(Applies to kvm.git/master:84a3c081)

For details, please read the patch header.

Background: The original vbus code was tightly integrated with kvm.ko.  Avi
suggested that we abstract the interfaces such that it could live outside
of kvm.  Part of that discussion turned into what is now irqfd/ioeventfd
checked into kvm.git.  The other part of the discussion (pointer-translation)
was suggested to be addressed by having the memory-slot info replicated
across interested modules.

I was looking into what is required for essentially hooking the
SET_MEMORY_REGION operations in QEMU yesterday by talking to Anthony and
Glauber on IRC.  We came to the conclusion that we could possibly
do a minor cleanup on the various callsites of SET_MEMORY_REGION so that
a wrapper function was used.  I could then add a hook at this wrapper
to essentially get notification events whenever memory-slots change, and
could forward this info to my module appropriate.

Anthony made the observation that replicating the slots info is
not the cleanest design, and I tend to agree with him.  Its replicating
data and is going to be prone to synchronization problems, etc.  He
also mentioned that other modules like virtio-net would want access
to this same information at some point, so perhaps a general solution was in
order. 

Therefore, I stepped back from that initial approach of replicating slots
and instead provided a abstract mechanism to utilize the original structures.

This code has been tested with a prototype of the vbus-v4 code and appears
to function properly.

Comments/suggestions?

Regards,
-Greg

---

Gregory Haskins (1):
  KVM: introduce xinterface API for external interaction with guests


 arch/x86/Kbuild|4 +
 arch/x86/kvm/Makefile  |4 +
 arch/x86/kvm/x86.c |1 
 include/linux/kvm.h|2 +
 include/linux/kvm_host.h   |6 ++
 include/linux/kvm_xinterface.h |   58 
 virt/kvm/kvm_main.c|   72 
 virt/kvm/xinterface.c  |  147 
 8 files changed, 293 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/kvm_xinterface.h
 create mode 100644 virt/kvm/xinterface.c

-- 
Signature
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
What: xinterface is a mechanism that allows kernel modules external to
the kvm.ko proper to interface with a running guest.  It accomplishes
this by creating an abstracted interface which does not expose any
private details of the guest or its related KVM structures, and provides
a mechanism to find and bind to this interface at run-time.  This
binding mechanism uses a userspace friendly token u64 vmid as a
handle.  This vmid acts similar to a file-descriptor in the sense that
it can be extracted from a guest, passed to an end-point of interest,
and finally, converted back to a vtable pointer using a stable interface.

Why: There are various subsystems that would like to interact with a KVM
guest which are ideally suited to exist outside the domain of the kvm.ko
core logic. For instance, external pci-passthrough, virtual-bus, and
virtio-net modules are currently under development.  In order for these
modules to successfully interact with the guest, they need, at the very
least, various interfaces for signaling IO events, pointer translation,
and possibly memory mapping.

The signaling case is covered by the recent introduction of the
irqfd/ioeventfd mechanisms.  This patch provides a mechanism to cover the
other cases.  Note that today we only expose pointer-translation related
functions, but more could be added at a future date as needs arise.

Security considerations: This concept is not believed to expose KVM to
any kind of additional security risk.  The vmid token itself can only be
acquired via an open handle to the vmfd (i.e. qemu-kvm), and the interface
is only available within the kernel.  Therefore the xinterface
admission policy is delegated to the kernel/lkm admission policy, which
must be assumed secure or the system is already compromised independent of
this work.

Additionally, the xinterface design is hardened against malformed vmid
tokens, as well as race conditions against valid tokens (e.g. guest
exiting before the token is redeemed).  It is additionally hardened
against races in the kvm.ko module itself by acquiring proper module
references.  As a final measure, we link the xinterface code statically
into the kernel so that callers are guaranteed a stable interface to
kvm_xinterface_find() without implicitly pinning kvm.ko or racing against
it.

Example usage: QEMU instantiates a guest, and an external module foo
that desires the ability to interface with the guest (say via
open(/dev/foo)).  QEMU may then issue a KVM_GET_VMID operation to acquire
the u64-based vmid, and pass it to ioctl(foofd, FOO_SET_VMID, vmid).
Upon receipt, the foo module can issue kvm_xinterface_find(vmid) to acquire
the proper context.  Internally, the struct kvm* and associated
struct module* will remain pinned at least until the foo module calls
kvm_xinterface_put().

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/Kbuild|4 +
 arch/x86/kvm/Makefile  |4 +
 arch/x86/kvm/x86.c |1 
 include/linux/kvm.h|2 +
 include/linux/kvm_host.h   |6 ++
 include/linux/kvm_xinterface.h |   58 
 virt/kvm/kvm_main.c|   72 
 virt/kvm/xinterface.c  |  147 
 8 files changed, 293 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/kvm_xinterface.h
 create mode 100644 virt/kvm/xinterface.c

diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec35..9f50cc3 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,5 +1,7 @@
 
-obj-$(CONFIG_KVM) += kvm/
+ifdef CONFIG_KVM
+obj-y += kvm/
+endif
 
 # Xen paravirtualization support
 obj-$(CONFIG_XEN) += xen/
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index afaaa76..80d951d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -17,3 +17,7 @@ kvm-amd-y += svm.o
 obj-$(CONFIG_KVM)  += kvm.o
 obj-$(CONFIG_KVM_INTEL)+= kvm-intel.o
 obj-$(CONFIG_KVM_AMD)  += kvm-amd.o
+
+ifdef CONFIG_KVM
+obj-y += $(addprefix ../../../virt/kvm/, xinterface.o)
+endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48567fa..5725527 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1208,6 +1208,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IOEVENTFD:
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
+   case KVM_CAP_XINTERFACE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 230a91a..7790894 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -435,6 +435,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_PIT_STATE2 35
 #endif
 #define KVM_CAP_IOEVENTFD 36
+#define KVM_CAP_XINTERFACE 37
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -544,6 +545,7 @@ struct kvm_irqfd {
 #define KVM_CREATE_PIT2   _IOW(KVMIO, 0x77, struct 
kvm_pit_config)
 #define KVM_SET_BOOT_CPU_ID_IO(KVMIO, 0x78)
 #define KVM_IOEVENTFD   

Re: Fix migration issue when the destination is loaded

2009-07-16 Thread Mark McLoughlin
On Thu, 2009-07-16 at 15:00 +0300, Dor Laor wrote:
 On 07/16/2009 12:39 PM, Mark McLoughlin wrote:
  Hi Dor,
 
  On Wed, 2009-07-15 at 18:03 +0300, Dor Laor wrote:
  If the migration socket is full, we get EAGAIN for the write.
  The set_fd_handler2 defers the write for later on. The function
  tries to wake up the iothread by qemu_kvm_notify_work.
  Since this happens in a loop, multiple times, the pipe that emulates
  eventfd becomes full and we get a deadlock.
 
  I'm not sure I follow:
 
 - You're seeing qemu_kvm_notify_work() being called many times
 
 - The call chain is migrate_fd_put_buffer(), qemu_set_fd_handler2(),
   main_loop_break()
 
 - This happens when write() in migrate_fd_put_buffer() returns EAGAIN
   because the socket buffer has filled up
 
  Correct?
 
  That sounds like migrate_fd_put_buffer() is being called repeatedly
  while we know the socket isn't writable?
 
  Shouldn't the buffered file could stop attempting to call put_buffer()
  until it has been notified that the underlying fd is writable?
 
 There are two fds here:
 The one returning EAGAIN, is the migration socket. That's why 
 migrate_fd_put_buffer is called and a call back is rescheduled by 
 qemu_set_fd_handler2.
 
 In the procedure of qemu_set_fd_handler2, the main_loop_break is called. 
 It needs to notify the iothread. It does is by qemu_eventfd, since it is 
 being emulated on older kernels, we use a pipe.
 
 The pipe fd is the one that blocks.
 I though of setting it to non-blocking but we might get partial writes 
 with a non blocking fd (in theory) too.

Yes, but if the pipe fd is blocking, that means we're writing a lot of
events to it, which means the write to the migration socket is failing
with EAGAIN a lot.

As soon as we get EAGAIN on the migration socket, we should stop trying
to write to socket until the we get notification that its writable.

If we did that, we'd only be writing a very small number of events to
the pipe and we wouldn't block.

Cheers,
Mark.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
Sam Ravnborg wrote:
 diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
 index ad8ec35..9f50cc3 100644
 --- a/arch/x86/Kbuild
 +++ b/arch/x86/Kbuild
 @@ -1,5 +1,7 @@
  
 -obj-$(CONFIG_KVM) += kvm/
 +ifdef CONFIG_KVM
 +obj-y += kvm/
 +endif
 

 What was wrong with the old version?
 If this is because xinterface.o is builtin to the kernel

Bingo

  then we should
 always visit kvm/ and use:

obj-y += kvm/

 unconditionally.

   
Ok, easy enough.

Thanks Sam,
-Greg





signature.asc
Description: OpenPGP digital signature


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Anthony Liguori

Gregory Haskins wrote:

+/*
+ * 
+ * XINTERFACE (External Interface)
+ * -
+ */
+
+static struct kvm *
+intf_to_kvm(struct kvm_xinterface *intf)
+{
+   return container_of(intf, struct kvm, xinterface);
+}
+
+static unsigned long
+xinterface_gpa_to_hva(struct kvm_xinterface *intf, unsigned long gpa)
+{
+   struct kvm *kvm = intf_to_kvm(intf);
+   unsigned long addr;
+
+   addr = gfn_to_hva(kvm, gpa  PAGE_SHIFT);
+   if (kvm_is_error_hva(addr))
+   return 0;
+
+   return addr + offset_in_page(gpa);
+}
+
+static struct page *
+xinterface_gpa_to_page(struct kvm_xinterface *intf, unsigned long gpa)
+{
+   struct kvm *kvm = intf_to_kvm(intf);
+   struct page *page;
+
+   page = gfn_to_page(kvm, gpa  PAGE_SHIFT);
+   if (page == bad_page)
+   return ERR_PTR(-EINVAL);
+
+   return page;
+}
+
+static void
+xinterface_release(struct kvm_xinterface *intf)
+{
+   struct kvm *kvm = intf_to_kvm(intf);
+
+   kvm_put_kvm(kvm);
+}
+
+struct kvm_xinterface_ops _kvm_xinterface_ops = {
+   .gpa_to_hva  = xinterface_gpa_to_hva,
+   .gpa_to_page = xinterface_gpa_to_page,
+   .release = xinterface_release,
+};
  


How do you deal with locking?

The mappings (gpa_to_page) are not fixed.  They can and do change very 
often.  The interface doesn't seem to attempt to cope with this.


Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
Anthony Liguori wrote:
 Gregory Haskins wrote:
 +/*
 + * 
 + * XINTERFACE (External Interface)
 + * -
 + */
 +
 +static struct kvm *
 +intf_to_kvm(struct kvm_xinterface *intf)
 +{
 +return container_of(intf, struct kvm, xinterface);
 +}
 +
 +static unsigned long
 +xinterface_gpa_to_hva(struct kvm_xinterface *intf, unsigned long gpa)
 +{
 +struct kvm *kvm = intf_to_kvm(intf);
 +unsigned long addr;
 +
 +addr = gfn_to_hva(kvm, gpa  PAGE_SHIFT);
 +if (kvm_is_error_hva(addr))
 +return 0;
 +
 +return addr + offset_in_page(gpa);
 +}
 +
 +static struct page *
 +xinterface_gpa_to_page(struct kvm_xinterface *intf, unsigned long gpa)
 +{
 +struct kvm *kvm = intf_to_kvm(intf);
 +struct page *page;
 +
 +page = gfn_to_page(kvm, gpa  PAGE_SHIFT);
 +if (page == bad_page)
 +return ERR_PTR(-EINVAL);
 +
 +return page;
 +}
 +
 +static void
 +xinterface_release(struct kvm_xinterface *intf)
 +{
 +struct kvm *kvm = intf_to_kvm(intf);
 +
 +kvm_put_kvm(kvm);
 +}
 +
 +struct kvm_xinterface_ops _kvm_xinterface_ops = {
 +.gpa_to_hva  = xinterface_gpa_to_hva,
 +.gpa_to_page = xinterface_gpa_to_page,
 +.release = xinterface_release,
 +};
   

 How do you deal with locking?

 The mappings (gpa_to_page) are not fixed.  They can and do change very
 often.  The interface doesn't seem to attempt to cope with this.

Hmm, well I used to need gpa_to_page() in the older version of vbus that
did explicit hypercalls, but I don't actually use it today in v4.  I
left it in because it seems like it might be useful in general (perhaps
for Michael).  However, if this ends up being a real problem I suppose
we can just drop that vtable entry and let the guy that actually needs
it deal with the issues ;)  I really only require gpa_to_hva() in the
short-term.

That said, I think the assumption that was made when I was using this
was that a proper ref for the page was acquired by the gfn_to_page() and
dropped by the caller.  This was always used in the context of a
hypercall/vmexit so presumably the gpa should be considered stable
across that call.  Is that not true?

Regards,
-Greg




signature.asc
Description: OpenPGP digital signature


[PATCH -tip -v12 04/11] kprobes: cleanup fix_riprel() using insn decoder on x86

2009-07-16 Thread Masami Hiramatsu
Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using x86 instruction
decoder.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Jim Keniston jkeni...@us.ibm.com
Cc: Ingo Molnar mi...@elte.hu
---

 arch/x86/kernel/kprobes.c |  128 -
 1 files changed, 23 insertions(+), 105 deletions(-)

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5341842..b77e050 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -109,50 +109,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
/*  --- */
/*  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
-   /*  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  */
-   /*  --- */
-   W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
-   W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
-   W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
-   W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
-   W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-   W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
-   W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
-   W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
-   W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-   W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
-   W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
-   W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
-   W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
-   W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-   W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
-   W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
-   /*  --- */
-   /*  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
-   /*  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  */
-   /*  --- */
-   W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
-   W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
-   W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
-   W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
-   W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
-   W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
-   W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
-   W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
-   W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
-   W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
-   W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
-   W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
-   W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
-   W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
-   W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
-   W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
-   /*  --- */
-   /*  0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  */
-};
 #undef W
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -345,68 +301,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 static void __kprobes fix_riprel(struct kprobe *p)
 {
 #ifdef CONFIG_X86_64
-   u8 *insn = p-ainsn.insn;
-   s64 disp;
-   int need_modrm;
-
-   /* Skip legacy instruction prefixes.  */
-   while (1) {
-   switch (*insn) {
-   case 0x66:
-   case 0x67:
-   case 0x2e:
-   case 0x3e:
-   case 0x26:
-   case 0x64:
-   case 0x65:
-   case 0x36:
-   case 0xf0:
-   case 0xf3:
-   case 0xf2:
-   ++insn;
-   continue;
-   }
-   break;
-   }
+   struct insn insn;
+   kernel_insn_init(insn, p-ainsn.insn);
 
-   /* Skip REX instruction prefix.  */
-   if (is_REX_prefix(insn))
-   ++insn;
-
-   if (*insn == 0x0f) {
-   /* Two-byte opcode.  */
-   ++insn;
-   

[PATCH -tip -v12 06/11] tracing: ftrace dynamic ftrace_event_call support

2009-07-16 Thread Masami Hiramatsu
Add dynamic ftrace_event_call support to ftrace. Trace engines can adds new
ftrace_event_call to ftrace on the fly. Each operator functions of the call
takes a ftrace_event_call data structure as an argument, because these
functions may be shared among several ftrace_event_calls.

Changes from v11:
 - Call remove_subsystem_dir() when unregistering an event call.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Acked-by: Frederic Weisbecker fweis...@gmail.com
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Tom Zanussi tzanu...@gmail.com
---

 include/linux/ftrace_event.h |   13 +---
 include/trace/ftrace.h   |   22 ++---
 kernel/trace/trace_events.c  |   72 --
 kernel/trace/trace_export.c  |   27 
 4 files changed, 86 insertions(+), 48 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 5c093ff..f7733b6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -108,12 +108,13 @@ struct ftrace_event_call {
struct dentry   *dir;
struct trace_event  *event;
int enabled;
-   int (*regfunc)(void);
-   void(*unregfunc)(void);
+   int (*regfunc)(struct ftrace_event_call *);
+   void(*unregfunc)(struct ftrace_event_call *);
int id;
-   int (*raw_init)(void);
-   int (*show_format)(struct trace_seq *s);
-   int (*define_fields)(void);
+   int (*raw_init)(struct ftrace_event_call *);
+   int (*show_format)(struct ftrace_event_call *,
+  struct trace_seq *);
+   int (*define_fields)(struct ftrace_event_call *);
struct list_headfields;
int filter_active;
void*filter;
@@ -138,6 +139,8 @@ extern int filter_current_check_discard(struct 
ftrace_event_call *call,
 
 extern int trace_define_field(struct ftrace_event_call *call, char *type,
  char *name, int offset, int size, int is_signed);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
 
 #define is_signed_type(type)   (((type)(-1))  0)
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1867553..d696580 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -147,7 +147,8 @@
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)   \
 static int \
-ftrace_format_##call(struct trace_seq *s)  \
+ftrace_format_##call(struct ftrace_event_call *event_call, \
+struct trace_seq *s)   \
 {  \
struct ftrace_raw_##call field __attribute__((unused)); \
int ret = 0;\
@@ -289,10 +290,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int 
flags)   \
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)   \
 int\
-ftrace_define_fields_##call(void)  \
+ftrace_define_fields_##call(struct ftrace_event_call *event_call)  \
 {  \
struct ftrace_raw_##call field; \
-   struct ftrace_event_call *event_call = event_##call;   \
int ret;\
\
__common_field(int, type, 1);   \
@@ -355,7 +355,7 @@ static inline int ftrace_get_offsets_##call(
\
  * event_trace_printk(_RET_IP_, call:  fmt);
  * }
  *
- * static int ftrace_reg_event_call(void)
+ * static int ftrace_reg_event_call(struct ftrace_event_call *unused)
  * {
  * int ret;
  *
@@ -366,7 +366,7 @@ static inline int ftrace_get_offsets_##call(
\
  * return ret;
  * }
  *
- * static void ftrace_unreg_event_call(void)
+ * static void ftrace_unreg_event_call(struct ftrace_event_call *unused)
  * {
  * unregister_trace_call(ftrace_event_call);
  * }
@@ -399,7 +399,7 @@ static inline int ftrace_get_offsets_##call(
\
  * trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
  *
- * static int ftrace_raw_reg_event_call(void)
+ * static 

[PATCH -tip -v12 07/11] tracing: Introduce TRACE_FIELD_ZERO() macro

2009-07-16 Thread Masami Hiramatsu
Use TRACE_FIELD_ZERO(type, item) instead of TRACE_FIELD_ZERO_CHAR(item).
This also includes a fix of TRACE_ZERO_CHAR() macro.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Tom Zanussi tzanu...@gmail.com
Cc: Frederic Weisbecker fweis...@gmail.com
---

 kernel/trace/trace_event_types.h |4 ++--
 kernel/trace/trace_export.c  |   16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 6db005e..e74f090 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -109,7 +109,7 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, 
ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD(char *, fmt, fmt)
-   TRACE_FIELD_ZERO_CHAR(buf)
+   TRACE_FIELD_ZERO(char, buf)
),
TP_RAW_FMT(%08lx (%d) fmt:%p %s)
 );
@@ -117,7 +117,7 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, 
ignore,
 TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
-   TRACE_FIELD_ZERO_CHAR(buf)
+   TRACE_FIELD_ZERO(char, buf)
),
TP_RAW_FMT(%08lx (%d) fmt:%p %s)
 );
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7cee79d..23125b5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -42,9 +42,9 @@ extern void __bad_type_size(void);
if (!ret)   \
return 0;
 
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)\
-   ret = trace_seq_printf(s, \tfield:char  #item ;\t   \
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)   \
+   ret = trace_seq_printf(s, \tfield: #type   #item ;\t  \
   offset:%u;\tsize:0;\n, \
   (unsigned int)offsetof(typeof(field), item)); \
if (!ret)   \
@@ -90,9 +90,6 @@ ftrace_format_##call(struct ftrace_event_call *dummy, struct 
trace_seq *s)\
 
 #include trace_event_types.h
 
-#undef TRACE_ZERO_CHAR
-#define TRACE_ZERO_CHAR(arg)
-
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)\
entry-item = assign;
@@ -105,6 +102,9 @@ ftrace_format_##call(struct ftrace_event_call *dummy, 
struct trace_seq *s)\
 #define TRACE_FIELD_SIGN(type, item, assign, is_signed)\
TRACE_FIELD(type, item, assign)
 
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)
+
 #undef TP_CMD
 #define TP_CMD(cmd...) cmd
 
@@ -176,8 +176,8 @@ __attribute__((section(_ftrace_events))) event_##call = { 
\
if (ret)\
return ret;
 
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)
+#undef TRACE_FIELD_ZERO
+#define TRACE_FIELD_ZERO(type, item)
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -tip -v12 05/11] x86: add pt_regs register and stack access APIs

2009-07-16 Thread Masami Hiramatsu
Add following APIs for accessing registers and stack entries from pt_regs.
These APIs are required by kprobes-based event tracer on ftrace.
Some other debugging tools might be able to use it too.

- regs_query_register_offset(const char *name)
   Query the offset of name register.

- regs_query_register_name(unsigned int offset)
   Query the name of register by its offset.

- regs_get_register(struct pt_regs *regs, unsigned int offset)
   Get the value of a register by its offset.

- regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
   Check the address is in the kernel stack.

- regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned int nth)
   Get Nth entry of the kernel stack. (N = 0)

- regs_get_argument_nth(struct pt_regs *reg, unsigned int nth)
   Get Nth argument at function call. (N = 0)

Changes from v10:
 - Use an offsetof table in regs_get_argument_nth().
 - Use unsigned int instead of unsigned.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Reviewed-by: Frederic Weisbecker fweis...@gmail.com
Cc: Andi Kleen a...@firstfloor.org
Cc: Christoph Hellwig h...@infradead.org
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Roland McGrath rol...@redhat.com
Cc: Srikar Dronamraju sri...@linux.vnet.ibm.com
Cc: linux-a...@vger.kernel.org
---

 arch/x86/include/asm/ptrace.h |   62 +++
 arch/x86/kernel/ptrace.c  |  112 +
 2 files changed, 174 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908..a3d49dd 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
 
 #ifdef __KERNEL__
 #include asm/segment.h
+#include asm/page_types.h
 #endif
 
 #ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct 
pt_regs *regs)
return regs-sp;
 }
 
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:  pt_regs from which register value is gotten.
+ * @offset:offset number of the register.
+ *
+ * regs_get_register returns the value of a register whose offset from @regs
+ * is @offset. The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned int offset)
+{
+   if (unlikely(offset  MAX_REG_OFFSET))
+   return 0;
+   return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:  pt_regs which contains kernel stack pointer.
+ * @addr:  address which is checked.
+ *
+ * regs_within_kenel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+  unsigned long addr)
+{
+   return ((addr  ~(THREAD_SIZE - 1))  ==
+   (kernel_stack_pointer(regs)  ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:  pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specifined by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n)
+{
+   unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+   addr += n;
+   if (regs_within_kernel_stack(regs, (unsigned long)addr))
+   return *addr;
+   else
+   return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+  unsigned int n);
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index cabdabc..32729ec 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -49,6 +49,118 @@ enum x86_regset {
REGSET_IOPERM32,
 };
 
+struct pt_regs_offset {
+   const char *name;
+   int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef 

[PATCH -tip -v12 09/11] tracing: Kprobe-tracer supports more than 6 arguments

2009-07-16 Thread Masami Hiramatsu
Support up to 128 arguments for each kprobes event.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Christoph Hellwig h...@infradead.org
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Tom Zanussi tzanu...@gmail.com
---

 Documentation/trace/kprobetrace.txt |2 +-
 kernel/trace/trace_kprobe.c |   21 +
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/Documentation/trace/kprobetrace.txt 
b/Documentation/trace/kprobetrace.txt
index 9ad907c..b29a54b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -32,7 +32,7 @@ Synopsis of kprobe_events
  SYMBOL[+offs|-offs]   : Symbol+offset where the probe is inserted.
  MEMADDR   : Address where the probe is inserted.
 
- FETCHARGS : Arguments.
+ FETCHARGS : Arguments. Each probe can have up to 128 args.
   %REG : Fetch register REG
   sN   : Fetch Nth entry of stack (N = 0)
   @ADDR: Fetch memory at ADDR (ADDR should be in kernel)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ad33073..67c33e1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -32,7 +32,7 @@
 #include trace.h
 #include trace_output.h
 
-#define TRACE_KPROBE_ARGS 6
+#define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
 
 /* currently, trace_kprobe only supports X86. */
@@ -178,11 +178,15 @@ struct trace_probe {
struct kretproberp;
};
const char  *symbol;/* symbol name */
-   unsigned intnr_args;
-   struct fetch_func   args[TRACE_KPROBE_ARGS];
struct ftrace_event_callcall;
+   unsigned intnr_args;
+   struct fetch_func   args[];
 };
 
+#define SIZEOF_TRACE_PROBE(n)  \
+   (offsetof(struct trace_probe, args) +   \
+   (sizeof(struct fetch_func) * (n)))
+
 static int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs);
 static int kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs);
@@ -255,11 +259,11 @@ static DEFINE_MUTEX(probe_lock);
 static LIST_HEAD(probe_list);
 
 static struct trace_probe *alloc_trace_probe(const char *symbol,
-const char *event)
+const char *event, int nargs)
 {
struct trace_probe *tp;
 
-   tp = kzalloc(sizeof(struct trace_probe), GFP_KERNEL);
+   tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
if (!tp)
return ERR_PTR(-ENOMEM);
 
@@ -559,9 +563,10 @@ static int create_trace_probe(int argc, char **argv)
if (offset  is_return)
return -EINVAL;
}
+   argc -= 2; argv += 2;
 
/* setup a probe */
-   tp = alloc_trace_probe(symbol, event);
+   tp = alloc_trace_probe(symbol, event, argc);
if (IS_ERR(tp))
return PTR_ERR(tp);
 
@@ -580,8 +585,8 @@ static int create_trace_probe(int argc, char **argv)
kp-addr = addr;
 
/* parse arguments */
-   argc -= 2; argv += 2; ret = 0;
-   for (i = 0; i  argc  i  TRACE_KPROBE_ARGS; i++) {
+   ret = 0;
+   for (i = 0; i  argc  i  MAX_TRACE_ARGS; i++) {
if (strlen(argv[i])  MAX_ARGSTR_LEN) {
pr_info(Argument%d(%s) is too long.\n, i, argv[i]);
ret = -ENOSPC;


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -tip -v12 10/11] tracing: Generate names for each kprobe event automatically

2009-07-16 Thread Masami Hiramatsu
Generate names for each kprobe event based on the probe point,
and remove generic k*probe event types because there is no user
of those types.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Christoph Hellwig h...@infradead.org
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Tom Zanussi tzanu...@gmail.com
---

 Documentation/trace/kprobetrace.txt |3 +-
 kernel/trace/trace_event_types.h|   18 --
 kernel/trace/trace_kprobe.c |   64 ++-
 3 files changed, 35 insertions(+), 50 deletions(-)

diff --git a/Documentation/trace/kprobetrace.txt 
b/Documentation/trace/kprobetrace.txt
index b29a54b..437ad49 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -28,7 +28,8 @@ Synopsis of kprobe_events
   p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS]: Set a probe
   r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe
 
- EVENT : Event name.
+ EVENT : Event name. If omitted, the event name is generated
+ based on SYMBOL+offs or MEMADDR.
  SYMBOL[+offs|-offs]   : Symbol+offset where the probe is inserted.
  MEMADDR   : Address where the probe is inserted.
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 186b598..e74f090 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -175,22 +175,4 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, 
kmemtrace_free_entry, ignore,
TP_RAW_FMT(type:%u call_site:%lx ptr:%p)
 );
 
-TRACE_EVENT_FORMAT(kprobe, TRACE_KPROBE, kprobe_trace_entry, ignore,
-   TRACE_STRUCT(
-   TRACE_FIELD(unsigned long, ip, ip)
-   TRACE_FIELD(int, nargs, nargs)
-   TRACE_FIELD_ZERO(unsigned long, args)
-   ),
-   TP_RAW_FMT(%08lx: args:0x%lx ...)
-);
-
-TRACE_EVENT_FORMAT(kretprobe, TRACE_KRETPROBE, kretprobe_trace_entry, ignore,
-   TRACE_STRUCT(
-   TRACE_FIELD(unsigned long, func, func)
-   TRACE_FIELD(unsigned long, ret_ip, ret_ip)
-   TRACE_FIELD(int, nargs, nargs)
-   TRACE_FIELD_ZERO(unsigned long, args)
-   ),
-   TP_RAW_FMT(%08lx - %08lx: args:0x%lx ...)
-);
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 67c33e1..3444d1d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -34,6 +34,7 @@
 
 #define MAX_TRACE_ARGS 128
 #define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
 
 /* currently, trace_kprobe only supports X86. */
 
@@ -272,11 +273,11 @@ static struct trace_probe *alloc_trace_probe(const char 
*symbol,
if (!tp-symbol)
goto error;
}
-   if (event) {
-   tp-call.name = kstrdup(event, GFP_KERNEL);
-   if (!tp-call.name)
-   goto error;
-   }
+   if (!event)
+   goto error;
+   tp-call.name = kstrdup(event, GFP_KERNEL);
+   if (!tp-call.name)
+   goto error;
 
INIT_LIST_HEAD(tp-list);
return tp;
@@ -306,7 +307,7 @@ static struct trace_probe *find_probe_event(const char 
*event)
struct trace_probe *tp;
 
list_for_each_entry(tp, probe_list, list)
-   if (tp-call.name  !strcmp(tp-call.name, event))
+   if (!strcmp(tp-call.name, event))
return tp;
return NULL;
 }
@@ -322,8 +323,7 @@ static void __unregister_trace_probe(struct trace_probe *tp)
 /* Unregister a trace_probe and probe_event: call with locking probe_lock */
 static void unregister_trace_probe(struct trace_probe *tp)
 {
-   if (tp-call.name)
-   unregister_probe_event(tp);
+   unregister_probe_event(tp);
__unregister_trace_probe(tp);
list_del(tp-list);
 }
@@ -352,18 +352,16 @@ static int register_trace_probe(struct trace_probe *tp)
goto end;
}
/* register as an event */
-   if (tp-call.name) {
-   old_tp = find_probe_event(tp-call.name);
-   if (old_tp) {
-   /* delete old event */
-   unregister_trace_probe(old_tp);
-   free_trace_probe(old_tp);
-   }
-   ret = register_probe_event(tp);
-   if (ret) {
-   pr_warning(Faild to register probe event(%d)\n, ret);
-   __unregister_trace_probe(tp);
-   }
+   old_tp = find_probe_event(tp-call.name);
+   if (old_tp) {
+   /* delete old event */
+   unregister_trace_probe(old_tp);
+   free_trace_probe(old_tp);
+   }
+   ret = register_probe_event(tp);
+   if (ret) {
+   pr_warning(Faild to 

[PATCH -tip -v12 00/11] tracing: kprobe-based event tracer and x86 instruction decoder

2009-07-16 Thread Masami Hiramatsu
Hi,

Here are the v12 patches. I updated it for the latest -tip and
add fix some bugs.

Here are the patches of kprobe-based event tracer for x86, version 12,
which allows you to probe various kernel events through ftrace interface.
The tracer supports per-probe filtering which allows you to set filters
on each probe and shows formats of each probe. I think this is more
generic integration with ftrace, especially event-tracer.

This version includes below small fixes.
 - Fix a buffer overflow bug. (PATCH 8/11)
 - Fix indirect memory access string bug. (PATCH 8/11)
 - Remove subsystem event directory if it is empty. (PATCH 6/11)
 - Cleanup code an remove redundant checks. (PATCH 8/11, 11/11)

This patchset also includes x86(-64) instruction decoder which
supports non-SSE/FP opcodes and includes x86 opcode map. The decoder
is used for finding the instruction boundaries when inserting new
kprobes. I think it will be possible to share this opcode map
with KVM's decoder.
The decoder is tested when building kernel, the test compares the 
results of objdump and the decoder right after building vmlinux.
You can enable that test by CONFIG_X86_DECODER_SELFTEST=y.

This series can be applied on the latest linux-2.6.31-rc3-tip.

This supports only x86(-32/-64) (but porting it on other arch
just needs kprobes/kretprobes and register and stack access APIs).


Enhancement ideas will be added after merging:
- Make a stress test of kprobes on this tracer.
  (see http://sources.redhat.com/ml/systemtap/2009-q2/msg01055.html)
- Easy probe setting wrapper which analyzes dwarf..
- .init function tracing support.
- Support primitive types(long, ulong, int, uint, etc) for args.


Kprobe-based Event Tracer
=

Overview

This tracer is similar to the events tracer which is based on Tracepoint
infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
and kretprobe). It probes anywhere where kprobes can probe(this means, all
functions body except for __kprobes functions).

Unlike the function tracer, this tracer can probe instructions inside of
kernel functions. It allows you to check which instruction has been executed.

Unlike the Tracepoint based events tracer, this tracer can add new probe points
on the fly.

Similar to the events tracer, this tracer doesn't need to be activated via
current_tracer, instead of that, just set probe points via
/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
probe events via /sys/kernel/debug/tracing/events/kprobes/EVENT/filter.


Synopsis of kprobe_events
-
  p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS] : Set a probe
  r[:EVENT] SYMBOL[+0] [FETCHARGS]  : Set a return probe

 EVENT  : Event name. If omitted, the event name is generated
  based on SYMBOL+offs or MEMADDR.
 SYMBOL[+offs|-offs]: Symbol+offset where the probe is inserted.
 MEMADDR: Address where the probe is inserted.

 FETCHARGS  : Arguments. Each probe can have up to 128 args.
  %REG  : Fetch register REG
  sN: Fetch Nth entry of stack (N = 0)
  @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
  @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
  aN: Fetch function argument. (N = 0)(*)
  rv: Fetch return value.(**)
  ra: Fetch return address.(**)
  +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***)

  (*) aN may not correct on asmlinkaged functions and at the middle of
  function body.
  (**) only for return probe.
  (***) this is useful for fetching a field of data structures.


Per-Probe Event Filtering
-
 Per-probe event filtering feature allows you to set different filter on each
probe and gives you what arguments will be shown in trace buffer. If an event
name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds
an event under tracing/events/kprobes/EVENT, at the directory you can see
'id', 'enabled', 'format' and 'filter'.

enabled:
  You can enable/disable the probe by writing 1 or 0 on it.

format:
  It shows the format of this probe event. It also shows aliases of arguments
 which you specified to kprobe_events.

filter:
  You can write filtering rules of this event. And you can use both of aliase
 names and field names for describing filters.


Event Profiling
---
 You can check the total number of probe hits and probe miss-hits via
/sys/kernel/debug/tracing/kprobe_profile.
 The first column is event name, the second is the number of probe hits,
the third is the number of probe miss-hits.


Usage examples
--
To add a probe as a new event, write a new definition to kprobe_events
as below.

  echo p:myprobe do_sys_open a0 a1 a2 a3  
/sys/kernel/debug/tracing/kprobe_events

 This sets a kprobe on the top of do_sys_open() function with recording
1st to 4th arguments as myprobe event.

  echo 

[PATCH -tip -v12 11/11] tracing: Add kprobes event profiling interface

2009-07-16 Thread Masami Hiramatsu
Add profiling interaces for each kprobes event.

Changes from v11:
 - Fix a typo and remove redundant check.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Christoph Hellwig h...@infradead.org
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Tom Zanussi tzanu...@gmail.com
Cc: Li Zefan l...@cn.fujitsu.com
---

 Documentation/trace/kprobetrace.txt |8 ++
 kernel/trace/trace_kprobe.c |   45 +++
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/Documentation/trace/kprobetrace.txt 
b/Documentation/trace/kprobetrace.txt
index 437ad49..9c6be05 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -69,6 +69,14 @@ filter:
  names and field names for describing filters.
 
 
+Event Profiling
+---
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is event name, the second is the number of probe hits,
+the third is the number of probe miss-hits.
+
+
 Usage examples
 --
 To add a probe as a new event, write a new definition to kprobe_events
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3444d1d..21e619f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -178,6 +178,7 @@ struct trace_probe {
struct kprobe   kp;
struct kretproberp;
};
+   unsigned long   nhits;
const char  *symbol;/* symbol name */
struct ftrace_event_callcall;
unsigned intnr_args;
@@ -766,6 +767,39 @@ static const struct file_operations kprobe_events_ops = {
.write  = probes_write,
 };
 
+/* Probes profiling interfaces */
+static int profile_seq_show(struct seq_file *m, void *v)
+{
+   struct trace_probe *tp = v;
+
+   seq_printf(m, %s, tp-call.name);
+
+   seq_printf(m, \t%8lu %8lu\n, tp-nhits,
+  probe_is_return(tp) ? tp-rp.kp.nmissed : tp-kp.nmissed);
+
+   return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+   .start  = probes_seq_start,
+   .next   = probes_seq_next,
+   .stop   = probes_seq_stop,
+   .show   = profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+   return seq_open(file, profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+   .owner  = THIS_MODULE,
+   .open   = profile_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= seq_release,
+};
+
 /* Kprobe handler */
 static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
@@ -776,6 +810,8 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, 
struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = tp-call;
 
+   tp-nhits++;
+
local_save_flags(irq_flags);
pc = preempt_count();
 
@@ -1152,9 +1188,18 @@ static __init int init_kprobe_trace(void)
entry = debugfs_create_file(kprobe_events, 0644, d_tracer,
NULL, kprobe_events_ops);
 
+   /* Event list interface */
if (!entry)
pr_warning(Could not create debugfs 
   'kprobe_events' entry\n);
+
+   /* Profile interface */
+   entry = debugfs_create_file(kprobe_profile, 0444, d_tracer,
+   NULL, kprobe_profile_ops);
+
+   if (!entry)
+   pr_warning(Could not create debugfs 
+  'kprobe_profile' entry\n);
return 0;
 }
 fs_initcall(init_kprobe_trace);


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Masami Hiramatsu
Add a user-space selftest of x86 instruction decoder at kernel build time.
When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86
instruction decoder and performs it after building vmlinux.
The test compares the results of objdump and x86 instruction decoder
code and check there are no differences.

Changes from v10:
 - Use unsigned int instead of unsigned.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Signed-off-by: Jim Keniston jkeni...@us.ibm.com
Cc: H. Peter Anvin h...@zytor.com
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Srikar Dronamraju sri...@linux.vnet.ibm.com
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Andi Kleen a...@linux.intel.com
Cc: Vegard Nossum vegard.nos...@gmail.com
Cc: Avi Kivity a...@redhat.com
Cc: Przemysław Pawełczyk przemys...@pawelczyk.it
Cc: Sam Ravnborg s...@ravnborg.org
---

 arch/x86/Kconfig.debug  |9 
 arch/x86/Makefile   |3 +
 arch/x86/include/asm/inat.h |2 +
 arch/x86/include/asm/insn.h |2 +
 arch/x86/lib/inat.c |2 +
 arch/x86/lib/insn.c |2 +
 arch/x86/scripts/Makefile   |   19 +++
 arch/x86/scripts/distill.awk|   42 +
 arch/x86/scripts/test_get_len.c |   99 +++
 arch/x86/scripts/user_include.h |   49 +++
 10 files changed, 229 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/scripts/Makefile
 create mode 100644 arch/x86/scripts/distill.awk
 create mode 100644 arch/x86/scripts/test_get_len.c
 create mode 100644 arch/x86/scripts/user_include.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29..7d0b681 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
 config HAVE_MMIOTRACE_SUPPORT
def_bool y
 
+config X86_DECODER_SELFTEST
+ bool x86 instruction decoder selftest
+ depends on DEBUG_KERNEL
+   ---help---
+Perform x86 instruction decoder selftests at build time.
+This option is useful for checking the sanity of x86 instruction
+decoder code.
+If unsure, say N.
+
 #
 # IO delay types:
 #
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1b68659..7046556 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -154,6 +154,9 @@ all: bzImage
 KBUILD_IMAGE := $(boot)/bzImage
 
 bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+   $(Q)$(MAKE) $(build)=arch/x86/scripts posttest
+endif
$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
$(Q)ln -fsn ../../x86/boot/bzImage 
$(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 01e079a..9090665 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -20,7 +20,9 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  */
+#ifdef __KERNEL__
 #include linux/types.h
+#endif
 
 /* Instruction attributes */
 typedef u32 insn_attr_t;
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 5b50fa3..5736404 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -20,7 +20,9 @@
  * Copyright (C) IBM Corporation, 2009
  */
 
+#ifdef __KERNEL__
 #include linux/types.h
+#endif
 /* insn_attr_t is defined in inat.h */
 #include asm/inat.h
 
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index d6a34be..564ecbd 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -18,7 +18,9 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  */
+#ifdef __KERNEL__
 #include linux/module.h
+#endif
 #include asm/insn.h
 
 /* Attribute tables are generated from opcode map */
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 254c848..3b9451a 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -18,8 +18,10 @@
  * Copyright (C) IBM Corporation, 2002, 2004, 2009
  */
 
+#ifdef __KERNEL__
 #include linux/string.h
 #include linux/module.h
+#endif
 #include asm/inat.h
 #include asm/insn.h
 
diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile
new file mode 100644
index 000..f08859e
--- /dev/null
+++ b/arch/x86/scripts/Makefile
@@ -0,0 +1,19 @@
+PHONY += posttest
+quiet_cmd_posttest = TEST$@
+  cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f 
$(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
+
+posttest: $(obj)/test_get_len vmlinux
+   $(call cmd,posttest)
+
+test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c 
$(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c
+test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h 
$(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
+quiet_cmd_test_get_len = CC  $@
+  cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) 

[PATCH -tip -v12 08/11] tracing: add kprobe-based event tracer

2009-07-16 Thread Masami Hiramatsu
Add kprobes-based event tracer on ftrace.

This tracer is similar to the events tracer which is based on Tracepoint
infrastructure. Instead of Tracepoint, this tracer is based on kprobes
(kprobe and kretprobe). It probes anywhere where kprobes can probe(this
 means, all functions body except for __kprobes functions).

Similar to the events tracer, this tracer doesn't need to be activated via
current_tracer, instead of that, just set probe points via
/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
probe events via /sys/kernel/debug/tracing/events/kprobes/EVENT/filter.

This tracer supports following probe arguments for each probe.

  %REG  : Fetch register REG
  sN: Fetch Nth entry of stack (N = 0)
  @ADDR : Fetch memory at ADDR (ADDR should be in kernel)
  @SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
  aN: Fetch function argument. (N = 0)
  rv: Fetch return value.
  ra: Fetch return address.
  +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.

See Documentation/trace/kprobetrace.txt for details.

Changes from v11:
 - Put a line after local variable definitions.
 - Fix indirect memory access string bug in trace_arg_string().
 - Remove redundant checks.
 - Fix buffer overflow in probes_write().
 - Fix probes_write() to support inputs ended without a new-line.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Acked-by: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Christoph Hellwig h...@infradead.org
Cc: Steven Rostedt rost...@goodmis.org
Cc: Ingo Molnar mi...@elte.hu
Cc: Frederic Weisbecker fweis...@gmail.com
Cc: Tom Zanussi tzanu...@gmail.com
Cc: Li Zefan l...@cn.fujitsu.com
---

 Documentation/trace/kprobetrace.txt |  138 
 kernel/trace/Kconfig|   12 
 kernel/trace/Makefile   |1 
 kernel/trace/trace.h|   29 +
 kernel/trace/trace_event_types.h|   18 +
 kernel/trace/trace_kprobe.c | 1193 +++
 6 files changed, 1391 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/trace/kprobetrace.txt
 create mode 100644 kernel/trace/trace_kprobe.c

diff --git a/Documentation/trace/kprobetrace.txt 
b/Documentation/trace/kprobetrace.txt
new file mode 100644
index 000..9ad907c
--- /dev/null
+++ b/Documentation/trace/kprobetrace.txt
@@ -0,0 +1,138 @@
+ Kprobe-based Event Tracer
+ =
+
+ Documentation is written by Masami Hiramatsu
+
+
+Overview
+
+This tracer is similar to the events tracer which is based on Tracepoint
+infrastructure. Instead of Tracepoint, this tracer is based on kprobes(kprobe
+and kretprobe). It probes anywhere where kprobes can probe(this means, all
+functions body except for __kprobes functions).
+
+Unlike the function tracer, this tracer can probe instructions inside of
+kernel functions. It allows you to check which instruction has been executed.
+
+Unlike the Tracepoint based events tracer, this tracer can add and remove
+probe points on the fly.
+
+Similar to the events tracer, this tracer doesn't need to be activated via
+current_tracer, instead of that, just set probe points via
+/sys/kernel/debug/tracing/kprobe_events. And you can set filters on each
+probe events via /sys/kernel/debug/tracing/events/kprobes/EVENT/filter.
+
+
+Synopsis of kprobe_events
+-
+  p[:EVENT] SYMBOL[+offs|-offs]|MEMADDR [FETCHARGS]: Set a probe
+  r[:EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe
+
+ EVENT : Event name.
+ SYMBOL[+offs|-offs]   : Symbol+offset where the probe is inserted.
+ MEMADDR   : Address where the probe is inserted.
+
+ FETCHARGS : Arguments.
+  %REG : Fetch register REG
+  sN   : Fetch Nth entry of stack (N = 0)
+  @ADDR: Fetch memory at ADDR (ADDR should be in kernel)
+  @SYM[+|-offs]: Fetch memory at SYM +|- offs (SYM should be a data 
symbol)
+  aN   : Fetch function argument. (N = 0)(*)
+  rv   : Fetch return value.(**)
+  ra   : Fetch return address.(**)
+  +|-offs(FETCHARG) : fetch memory at FETCHARG +|- offs address.(***)
+
+  (*) aN may not correct on asmlinkaged functions and at the middle of
+  function body.
+  (**) only for return probe.
+  (***) this is useful for fetching a field of data structures.
+
+
+Per-Probe Event Filtering
+-
+ Per-probe event filtering feature allows you to set different filter on each
+probe and gives you what arguments will be shown in trace buffer. If an event
+name is specified right after 'p:' or 'r:' in kprobe_events, the tracer adds
+an event under tracing/events/kprobes/EVENT, at the directory you can see
+'id', 'enabled', 'format' and 'filter'.
+
+enabled:
+  You can enable/disable the probe by writing 1 or 0 on it.
+
+format:
+  It shows the format of this probe event. It also shows aliases of arguments
+ which you specified 

[PATCH -tip -v12 03/11] kprobes: checks probe address is instruction boudary on x86

2009-07-16 Thread Masami Hiramatsu
Ensure safeness of inserting kprobes by checking whether the specified
address is at the first byte of a instruction on x86.
This is done by decoding probed function from its head to the probe point.

Signed-off-by: Masami Hiramatsu mhira...@redhat.com
Acked-by: Ananth N Mavinakayanahalli ana...@in.ibm.com
Cc: Jim Keniston jkeni...@us.ibm.com
Cc: Ingo Molnar mi...@elte.hu
---

 arch/x86/kernel/kprobes.c |   69 +
 1 files changed, 69 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b5b1848..5341842 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,6 +48,7 @@
 #include linux/preempt.h
 #include linux/module.h
 #include linux/kdebug.h
+#include linux/kallsyms.h
 
 #include asm/cacheflush.h
 #include asm/desc.h
@@ -55,6 +56,7 @@
 #include asm/uaccess.h
 #include asm/alternative.h
 #include asm/debugreg.h
+#include asm/insn.h
 
 void jprobe_return_end(void);
 
@@ -245,6 +247,71 @@ retry:
}
 }
 
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+   struct kprobe *kp;
+   kp = get_kprobe((void *)addr);
+   if (!kp)
+   return -EINVAL;
+
+   /*
+*  Basically, kp-ainsn.insn has an original instruction.
+*  However, RIP-relative instruction can not do single-stepping
+*  at different place, fix_riprel() tweaks the displacement of
+*  that instruction. In that case, we can't recover the instruction
+*  from the kp-ainsn.insn.
+*
+*  On the other hand, kp-opcode has a copy of the first byte of
+*  the probed instruction, which is overwritten by int3. And
+*  the instruction at kp-addr is not modified by kprobes except
+*  for the first byte, we can recover the original instruction
+*  from it and kp-opcode.
+*/
+   memcpy(buf, kp-addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+   buf[0] = kp-opcode;
+   return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+   int ret;
+   unsigned long addr, offset = 0;
+   struct insn insn;
+   kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+   if (!kallsyms_lookup(paddr, NULL, offset, NULL, __dummy_buf))
+   return 0;
+
+   /* Decode instructions */
+   addr = paddr - offset;
+   while (addr  paddr) {
+   kernel_insn_init(insn, (void *)addr);
+   insn_get_opcode(insn);
+
+   /* Check if the instruction has been modified. */
+   if (OPCODE1(insn) == BREAKPOINT_INSTRUCTION) {
+   ret = recover_probed_instruction(buf, addr);
+   if (ret)
+   /*
+* Another debugging subsystem might insert
+* this breakpoint. In that case, we can't
+* recover it.
+*/
+   return 0;
+   kernel_insn_init(insn, buf);
+   }
+   insn_get_length(insn);
+   addr += insn.length;
+   }
+
+   return (addr == paddr);
+}
+
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
@@ -360,6 +427,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
+   if (!can_probe((unsigned long)p-addr))
+   return -EILSEQ;
/* insn: must be on special executable page on x86. */
p-ainsn.insn = get_insn_slot();
if (!p-ainsn.insn)


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Anthony Liguori

Gregory Haskins wrote:

That said, I think the assumption that was made when I was using this
was that a proper ref for the page was acquired by the gfn_to_page() and
dropped by the caller.  This was always used in the context of a
hypercall/vmexit so presumably the gpa should be considered stable
across that call.  Is that not true?
  


If you're in kvm.ko, then yes, that's a safe assumption to make because 
the guest VCPU cannot run while you are running.


But you're opening this interface to any caller so the VCPU is likely to 
be running while someone calls this function



Regards,
-Greg


  



--
Regards,

Anthony Liguori

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread H. Peter Anvin

Masami Hiramatsu wrote:


These opcode maps do NOT include most of SSE and FP opcodes, because
those opcodes are not used in the kernel.



That is not true.

-hpa
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Greg KH
On Thu, Jul 16, 2009 at 05:12:55PM +0200, Hans J. Koch wrote:
 On Thu, Jul 16, 2009 at 05:07:10PM +0300, Michael S. Tsirkin wrote:
  On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
   How about moving that documentation into a place that people will notice
   it, like the rest of the UIO documentation?
  
  Greg,
  
  would it make more sense to add this to
  Documentation/DocBook/uio-howto.xml, or to create
  Documentation/uio_pci_generic.txt ?
 
 Hi Michael,
 I'd prefer to have it in uio-howto.xml so that people only have to look in
 one place. In does not have to be very detailled, just a short explanation
 what this driver is all about and a short example how to use it.

I agree, that would be the best place for it.

thanks,

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Sam Ravnborg
On Thu, Jul 16, 2009 at 11:57:06AM -0400, Masami Hiramatsu wrote:
 Add a user-space selftest of x86 instruction decoder at kernel build time.
 When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86
 instruction decoder and performs it after building vmlinux.
 The test compares the results of objdump and x86 instruction decoder
 code and check there are no differences.

Long overdue review from my side...

  arch/x86/scripts/Makefile   |   19 +++
  arch/x86/scripts/distill.awk|   42 +
  arch/x86/scripts/test_get_len.c |   99 
 +++
  arch/x86/scripts/user_include.h |   49 +++

Hmmm, we have two architectures that uses scripts/ and three that
uses tools/.
I prefer the latter name as what we have ere is beyound what
I generally recognize as a script.

we have scripts/ in top-level and we do not rename this
as we have this hardcoded too many places - but no reason to
use the wrong name here.

 diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
 index 01e079a..9090665 100644
 --- a/arch/x86/include/asm/inat.h
 +++ b/arch/x86/include/asm/inat.h
 @@ -20,7 +20,9 @@
   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   *
   */
 +#ifdef __KERNEL__
  #include linux/types.h
 +#endif
  
  /* Instruction attributes */
  typedef u32 insn_attr_t;

Why this?
If you need this to use this file from userspace then could we do some
other trick to make this OK?

I see it repeated several times below.
[If this has already been discussed I have missed it - sorry].


 diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile
 new file mode 100644
 index 000..f08859e
 --- /dev/null
 +++ b/arch/x86/scripts/Makefile
 @@ -0,0 +1,19 @@
 +PHONY += posttest
 +quiet_cmd_posttest = TEST$@
 +  cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f 
 $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
 +

You are using the native objdump here.
But I assume this fails miserably when you build x86 on a powerpc host.
In other words - you broke an allyesconfig build for -next...
We have $(OBJDUMP) for this.

 +posttest: $(obj)/test_get_len vmlinux
 + $(call cmd,posttest)
 +
 +test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c 
 $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c
 +test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h 
 $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 +
 +quiet_cmd_test_get_len = CC  $@
 +  cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) 
 -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include -include 
 $(srctree)/arch/x86/scripts/user_include.h -o $@

Is there a specific reason why you cannot use the standard hostprogs-y for this?
It will take care of dependency tracking etc.
What you have above is a hopeless incomplete list of dependencies.

You need to use HOST_EXTRACFLAGS to set additional -I options and the -include.

 +
 +static void usage()
 +{
 + fprintf(stderr, usage: %s  distilled_disassembly\n, prog);
 + exit(1);
 +}

It would be nice to tell the user what the program is supposed to do.
I know this is a bit unusual but no reason to copy bad practice.

 index 000..3bdcc55
 --- /dev/null
 +++ b/arch/x86/scripts/user_include.h
 @@ -0,0 +1,49 @@
 +#ifndef __USER_TYPES_H
 +#define __USER_TYPES_H
 +
 +/*
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License
 + * along with this program; if not, write to the Free Software
 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 + *
 + * Copyright (C) IBM Corporation, 2009
 + */
 +
 +#include string.h
 +
 +#ifdef __x86_64__
 +#define CONFIG_X86_64
 +#else
 +#define CONFIG_X86_32
 +#endif
 +typedef unsigned char u8;
 +typedef unsigned short u16;
 +typedef unsigned int u32;
 +typedef unsigned long long u64;
 +
 +typedef signed char s8;
 +typedef short s16;
 +typedef int s32;
 +typedef long long s64;
 +
 +typedef enum bool { false = 0, true } bool;
 +
 +/* any harmless file-scope decl */
 +#define NOP_DECL struct __nop
 +#define EXPORT_SYMBOL_GPL(symbol) NOP_DECL
 +#define MODULE_LICENSE(gpl) NOP_DECL
 +
 +#define WARN_ON(cond) do { } while (0)
 +#define unlikely(cond) (cond)
 +

So this is a file that alows you to include the other files without dragging in
a massive amount a stuff.
Would be nice if you wrote so in the file so it is explicit.


I tried 

Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread Sam Ravnborg
 
 diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
 new file mode 100644
 index 000..01e079a
 --- /dev/null
 +++ b/arch/x86/include/asm/inat.h
 @@ -0,0 +1,125 @@
 +#ifndef _ASM_INAT_INAT_H
 +#define _ASM_INAT_INAT_H

[With reference to comment on patch 2/12...]
You create inat.h here.
Could you investigave what is needed to factor out the stuff
needed from userspace so we can avoid the ugly havk where
you redefine types.h?

Maybe create a inat_types.h + inat.h as we do in other cases?

Same for the other files that requred the types.h hack.

Sam

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread Masami Hiramatsu
On 2009年07月16日 12:19, H. Peter Anvin wrote:
 Masami Hiramatsu wrote:

 These opcode maps do NOT include most of SSE and FP opcodes, because
 those opcodes are not used in the kernel.

 
 That is not true.

Ah, these opcode maps include some SSE/FP setup opcdes which
are used in the kernel.

I've found that opcodes while running selftest of decoder,
so, I checked asm() code and added those in the maps.

Thank you,

-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Arnd Bergmann
On Thursday 16 July 2009, Gregory Haskins wrote:
 Background: The original vbus code was tightly integrated with kvm.ko.  Avi
 suggested that we abstract the interfaces such that it could live outside
 of kvm.

The code is still highly kvm-specific, you would not be able to use
it with another hypervisor like lguest or vmware player, right?

 Example usage: QEMU instantiates a guest, and an external module foo
 that desires the ability to interface with the guest (say via
 open(/dev/foo)).  QEMU may then issue a KVM_GET_VMID operation to acquire
 the u64-based vmid, and pass it to ioctl(foofd, FOO_SET_VMID, vmid).
 Upon receipt, the foo module can issue kvm_xinterface_find(vmid) to acquire
 the proper context.  Internally, the struct kvm* and associated
 struct module* will remain pinned at least until the foo module calls
 kvm_xinterface_put().

Your approach allows passing the vmid from a process that does
not own the kvm context. This looks like an intentional feature,
but I can't see what this gains us. 

 As a final measure, we link the xinterface code statically
 into the kernel so that callers are guaranteed a stable interface to
 kvm_xinterface_find() without implicitly pinning kvm.ko or racing against
 it.

I also don't understand this. Are you worried about driver modules
breaking when an externally-compiled kvm.ko is loaded? The same could
be achieved by defining your data structures kvm_xinterface_ops and
kvm_xinterface in a kernel header that is not shipped by kvm-kmod but
always taken from the kernel headers.
It does not matter if the entry points are build into the kernel or
exported from a kvm.ko as long as you define a fixed ABI.

What is the problem with pinning kvm.ko from another module using
its features?

Can't you simply provide a function call to lookup the kvm context
pointer from the file descriptor to achieve the same functionality?

To take that thought further, maybe the dependency can be turned
around: If every user (pci-uio, virtio-net, ...) exposes a file
descriptor based interface to user space, you can have a kvm
ioctl to register the object behind that file descriptor with
an existing kvm context to associate it with a guest. That would
nicely solve the life time questions by pinning the external
object for the life time of the kvm context rather than the other
way round, and it would be completely separate from kvm in that
each such object could be used by other subsystems independent
of kvm.

Arnd 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Michael S. Tsirkin
On Thu, Jul 16, 2009 at 08:52:08AM -0700, Greg KH wrote:
 On Thu, Jul 16, 2009 at 05:12:55PM +0200, Hans J. Koch wrote:
  On Thu, Jul 16, 2009 at 05:07:10PM +0300, Michael S. Tsirkin wrote:
   On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
How about moving that documentation into a place that people will notice
it, like the rest of the UIO documentation?
   
   Greg,
   
   would it make more sense to add this to
   Documentation/DocBook/uio-howto.xml, or to create
   Documentation/uio_pci_generic.txt ?
  
  Hi Michael,
  I'd prefer to have it in uio-howto.xml so that people only have to look in
  one place. In does not have to be very detailled, just a short explanation
  what this driver is all about and a short example how to use it.
 
 I agree, that would be the best place for it.
 
 thanks,
 
 greg k-h

OK. Could you take a look at the following please?

--

doc: Document uio_pci_generic

Signed-off-by: Michael S. Tsirkin m...@redhat.com

---

diff --git a/Documentation/DocBook/uio-howto.tmpl 
b/Documentation/DocBook/uio-howto.tmpl
index 8f6e3b2..4d4ce0e 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -25,6 +25,10 @@
year2006-2008/year
holderHans-Jürgen Koch./holder
 /copyright
+copyright
+   year2009/year
+   holderRed Hat Inc, Michael S. Tsirkin (m...@redhat.com)/holder
+/copyright
 
 legalnotice
 para
@@ -42,6 +46,13 @@ GPL version 2.
 
 revhistory
revision
+   revnumber0.9/revnumber
+   date2009-07-16/date
+   authorinitialsmst/authorinitials
+   revremarkAdded generic pci driver
+   /revremark
+   /revision
+   revision
revnumber0.8/revnumber
date2008-12-24/date
authorinitialshjk/authorinitials
@@ -809,6 +820,158 @@ framework to set up sysfs files for this region. Simply 
leave it alone.
 
 /chapter
 
+chapter id=uio_pci_generic xreflabel=Using Generic driver for PCI cards
+?dbhtml filename=uio_pci_generic.html?
+titleGeneric PCI UIO driver/title
+   para
+   The generic driver is a kernel module named uio_pci_generic.
+   It can work with any device compliant to PCI 2.3 (circa 2002) and
+   any compliant PCI Express device. Using this, you only need to
+write the userspace driver, removing the need to write
+a hardware-specific kernel module.
+   /para
+
+sect1 id=uio_pci_generic_binding
+titleMaking the driver recognize the device/title
+   para
+Since the driver does not declare any device ids, it will not get loaded
+automatically and will not automatically bind to any devices, you must load it
+and allocate id to the driver yourself. For example:
+   programlisting
+ modprobe uio_pci_generic
+ echo quot;8086 10f5quot; gt; /sys/bus/pci/drivers/uio_pci_generic/new_id
+   /programlisting
+   /para
+   para
+If there already is a hardware specific kernel driver for your device, the
+generic driver still won't bind to it, in this case if you want to use the
+generic driver (why would you?) you'll have to manually unbind the hardware
+specific driver and bind the generic driver, like this:
+   programlisting
+echo -n :00:19.0 gt; /sys/bus/pci/drivers/e1000e/unbind
+echo -n :00:19.0 gt; /sys/bus/pci/drivers/uio_pci_generic/bind
+   /programlisting
+   /para
+   para
+You can verify that the device has been bound to the driver
+by looking for it in sysfs, for example like the following:
+   programlisting
+ls -l /sys/bus/pci/devices/:00:19.0/driver
+   /programlisting
+Which if successful should print
+   programlisting
+  .../:00:19.0/driver -gt; ../../../bus/pci/drivers/uio_pci_generic
+   /programlisting
+Note that the generic driver will not bind to old PCI 2.2 devices.
+If binding the device failed, run the following command:
+   programlisting
+  dmesg
+   /programlisting
+and look in the output for failure reasons
+   /para
+/sect1
+
+sect1 id=uio_pci_generic_internals
+titleThings to know about uio_pci_generic/title
+   para
+Interrupts are handled using the Interrupt Disable bit in the PCI command
+register and Interrupt Status bit in the PCI status register.  All devices
+compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices should
+support these bits.  uio_pci_generic detects this support, and won't bind to
+devices which do not support the Interrupt Disable Bit in the command register.
+   /para
+   para
+On each interrupt, uio_pci_generic sets the Interrupt Disable bit.
+This prevents the device from generating further interrupts
+until the bit is cleared. The userspace driver should clear this
+bit before blocking and waiting for more interrupts.
+   /para
+/sect1
+sect1 id=uio_pci_generic_userspace
+titleWriting userspace driver using uio_pci_generic/title
+   para
+Userspace driver can use pci sysfs interface, or the
+libpci libray that wraps it, to talk to the 

Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread Masami Hiramatsu
Sam Ravnborg wrote:
 diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
 new file mode 100644
 index 000..01e079a
 --- /dev/null
 +++ b/arch/x86/include/asm/inat.h
 @@ -0,0 +1,125 @@
 +#ifndef _ASM_INAT_INAT_H
 +#define _ASM_INAT_INAT_H
 
 [With reference to comment on patch 2/12...]
 You create inat.h here.
 Could you investigave what is needed to factor out the stuff
 needed from userspace so we can avoid the ugly havk where
 you redefine types.h?

Sorry, I'm a bit confusing.
Would you mean that I should break down user_include.h and
add those redefined types in inat.h?

 Maybe create a inat_types.h + inat.h as we do in other cases?

And inat_types.h has two parts, one for kernel, and one for
userspace(which is moved from user_include.h), is that right?

Thank you,

 
 Same for the other files that requred the types.h hack.
 
   Sam
 

-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Masami Hiramatsu
Sam Ravnborg wrote:
 On Thu, Jul 16, 2009 at 11:57:06AM -0400, Masami Hiramatsu wrote:
 Add a user-space selftest of x86 instruction decoder at kernel build time.
 When CONFIG_X86_DECODER_SELFTEST=y, Kbuild builds a test harness of x86
 instruction decoder and performs it after building vmlinux.
 The test compares the results of objdump and x86 instruction decoder
 code and check there are no differences.
 
 Long overdue review from my side...
 
  arch/x86/scripts/Makefile   |   19 +++
  arch/x86/scripts/distill.awk|   42 +
  arch/x86/scripts/test_get_len.c |   99 
 +++
  arch/x86/scripts/user_include.h |   49 +++
 
 Hmmm, we have two architectures that uses scripts/ and three that
 uses tools/.
 I prefer the latter name as what we have ere is beyound what
 I generally recognize as a script.
 
 we have scripts/ in top-level and we do not rename this
 as we have this hardcoded too many places - but no reason to
 use the wrong name here.
 
 diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
 index 01e079a..9090665 100644
 --- a/arch/x86/include/asm/inat.h
 +++ b/arch/x86/include/asm/inat.h
 @@ -20,7 +20,9 @@
   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
 USA.
   *
   */
 +#ifdef __KERNEL__
  #include linux/types.h
 +#endif
  
  /* Instruction attributes */
  typedef u32 insn_attr_t;
 
 Why this?
 If you need this to use this file from userspace then could we do some
 other trick to make this OK?



 
 I see it repeated several times below.
 [If this has already been discussed I have missed it - sorry].
 
 
 diff --git a/arch/x86/scripts/Makefile b/arch/x86/scripts/Makefile
 new file mode 100644
 index 000..f08859e
 --- /dev/null
 +++ b/arch/x86/scripts/Makefile
 @@ -0,0 +1,19 @@
 +PHONY += posttest
 +quiet_cmd_posttest = TEST$@
 +  cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f 
 $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
 +
 
 You are using the native objdump here.
 But I assume this fails miserably when you build x86 on a powerpc host.
 In other words - you broke an allyesconfig build for -next...
 We have $(OBJDUMP) for this.

Ah, I see... Would you know actual name of x86-objdump on the powerpc
(or any other crosscompiling host)? I just set OBJDUMP=objdump is OK?
I'm not so sure about cross-compiling kernel...

 +posttest: $(obj)/test_get_len vmlinux
 +$(call cmd,posttest)
 +
 +test_get_len_SRC = $(srctree)/arch/x86/scripts/test_get_len.c 
 $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c
 +test_get_len_INC = $(srctree)/arch/x86/include/asm/inat.h 
 $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 +
 +quiet_cmd_test_get_len = CC  $@
 +  cmd_test_get_len = $(CC) -Wall $(test_get_len_SRC) 
 -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include -include 
 $(srctree)/arch/x86/scripts/user_include.h -o $@
 
 Is there a specific reason why you cannot use the standard hostprogs-y for 
 this?
 It will take care of dependency tracking etc.
 What you have above is a hopeless incomplete list of dependencies.
 
 You need to use HOST_EXTRACFLAGS to set additional -I options and the 
 -include.

Thank you, I'll try to use hostprogs-y.

 +
 +static void usage()
 +{
 +fprintf(stderr, usage: %s  distilled_disassembly\n, prog);
 +exit(1);
 +}
 
 It would be nice to tell the user what the program is supposed to do.
 I know this is a bit unusual but no reason to copy bad practice.
 

Sure, maybe copying usage line in distill.awk is more helpful for user...

Thank you,

-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Update qemu-kvm to allow a larger BIOS image.

2009-07-16 Thread Jordan Justen
On Wed, Jul 15, 2009 at 10:37 PM, Sheng Yangsh...@linux.intel.com wrote:
 Make sense to me. So what's mattered here is not bios, but qemu-kvm and kvm
 code. The user can replace bios binary by UEFI binary easily, but not with kvm
 related part.

 I realized you still need separate the qemu-kvm patch into two: one for bios
 and another for qemu.

Okay, I will split this change apart.

 And I just hope this modification won't break some old
 OS(any OS got assumption about bios size? I think Tiano team should have idea
 on it)...

I don't think the OS should care about the bios size, but it is
important to update the INT15-E820 memory ranges to help
make sure the OS knows which regions are in use.

Even without the E820 change, I think it would be unlikely
for an OS to try utilize these memory regions, but it is
definitely better to properly describe it.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] Move qemu-kvm 'VMC TSS Pages' to allow a larger BIOS image.

2009-07-16 Thread Jordan Justen
Move qemu-kvm 'VMC TSS Pages' from:
  0xfffbd000-0xfffb
to:
  0xfeffd000-0xfeff

The step is required to free up the 0xff00-0x (16MB) range
for use with bios.bin.

This change depends upon a change to kvm/bios/rombios.c so the bios
INT15-E820 function will properly reserve the new location.

Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
---
 qemu-kvm-x86.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c
index daf60b6..b5306aa 100644
--- a/qemu-kvm-x86.c
+++ b/qemu-kvm-x86.c
@@ -63,7 +63,7 @@ static int kvm_init_tss(kvm_context_t kvm)
 * this address is 3 pages before the bios, and the bios should 
present
 * as unavaible memory
 */
-   r = kvm_set_tss_addr(kvm, 0xfffbd000);
+   r = kvm_set_tss_addr(kvm, 0xfeffd000);
if (r  0) {
fprintf(stderr, kvm_init_tss: unable to set tss 
addr\n);
return r;
-- 
1.6.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] Update qemu-kvm bios to allow for a larger bios image.

2009-07-16 Thread Jordan Justen
The bios will now reserve more memory via the E820 functions.

Previously we reserved:
  0xfffbc000-0xfffbcfff -   4KB - KVM kernel module, EPT identity pages tables
  0xfffbd000-0xfffb -  12KB - KVM bios, VMC TSS Pages
  0xfffc-0x - 256KB - Max KVM bios.bin (usually top 128KB is used)

Now we will reserve:
  0xfeffc000-0xfeffcfff -   4KB - KVM kernel module, EPT identity pages tables
  0xfeffd000-0xfeff -  12KB - KVM bios, VMC TSS Pages
  0xff00-0x -  16MB - Max KVM bios.bin

Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
---
 kvm/bios/rombios.c |8 
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kvm/bios/rombios.c b/kvm/bios/rombios.c
index 6186199..2d0c153 100644
--- a/kvm/bios/rombios.c
+++ b/kvm/bios/rombios.c
@@ -4596,14 +4596,14 @@ ASM_END
 case 5:
 /* 4 pages before the bios, 3 pages for vmx tss pages,
 * the other page for EPT real mode pagetable */
-set_e820_range(ES, regs.u.r16.di, 0xfffbc000L,
-   0xfffcL, 0, 0, 2);
+set_e820_range(ES, regs.u.r16.di, 0xfeffc000L,
+   0xff00L, 0, 0, 2);
 regs.u.r32.ebx = 6;
 break;
 case 6:
-/* 256KB BIOS area at the end of 4 GB */
+/* 16MB BIOS area at the end of 4 GB */
 set_e820_range(ES, regs.u.r16.di,
-   0xfffcL, 0xL ,0, 0, 2);
+   0xff00L, 0xL ,0, 0, 2);
 if (extra_highbits_memory_size || 
extra_lowbits_memory_size)
 regs.u.r32.ebx = 7;
 else
-- 
1.6.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] Update VMX_EPT_IDENTITY_PAGETABLE_ADDR to synchronize with kernel code.

2009-07-16 Thread Jordan Justen
Although VMX_EPT_IDENTITY_PAGETABLE_ADDR does not appear to be used within
qemu-kvm, this change mirrors a similar change in the kernel kvm code.

The purpose is to move the KVM 'EPT Identity Pages' from:
  0xfffbc000-0xfffbcfff
to:
  0xfeffc000-0xfeffcfff

The step is required to free up the 0xff00-0x (16MB) range
for use with bios.bin.

The KVM kernel change depends upon a change to kvm/bios/rombios.c so the bios
INT15-E820 function will properly reserve the new location.

Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
---
 kvm/include/x86/asm/vmx.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kvm/include/x86/asm/vmx.h b/kvm/include/x86/asm/vmx.h
index df8d4f9..99e2bb9 100644
--- a/kvm/include/x86/asm/vmx.h
+++ b/kvm/include/x86/asm/vmx.h
@@ -403,7 +403,7 @@ enum vmcs_field {
 #define VMX_EPT_EXECUTABLE_MASK0x4ull
 #define VMX_EPT_IGMT_BIT   (1ull  6)
 
-#define VMX_EPT_IDENTITY_PAGETABLE_ADDR0xfffbc000ul
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR0xfeffc000ul
 
 
 #define ASM_VMX_VMCLEAR_RAX   .byte 0x66, 0x0f, 0xc7, 0x30
-- 
1.6.0.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/10] Move IO APIC to its own lock.

2009-07-16 Thread Marcelo Tosatti
On Thu, Jul 16, 2009 at 10:51:16AM +0300, Gleb Natapov wrote:
 On Wed, Jul 15, 2009 at 06:38:48PM -0300, Marcelo Tosatti wrote:
  On Wed, Jul 15, 2009 at 11:48:17PM +0300, Gleb Natapov wrote:
 + spin_unlock(ioapic-lock);
 + kvm_notify_acked_irq(ioapic-kvm, KVM_IRQCHIP_IOAPIC, 
 i);
 + spin_lock(ioapic-lock);

While traversing the IOAPIC pins you drop the lock and acquire it again,
which is error prone: what if the redirection table is changed in
between, for example?

   Yes, the ack notifiers is a problematic part. The only thing that
   current ack notifiers do is set_irq_level(0) so this is not the problem
   in practice. I'll see if I can eliminate this dropping of the lock here.
  
  Currently, yes. But take into account that an ack notifier might do 
  other things than set_irq_level(0).
  
 For instance? Ack notifier can do whatever it wants if it does not call
 back into ioapic. It may call to ioapic only by set_irq_level(0) (which
 it does now from device assignment code) or by set_irq_level(1) and this
 does not make sense for level triggered interrupts because not calling
 set_irq_level(1) will have the same effect.

Checking whether a given gsi is masked on the
PIC/IOAPIC from the PIC/IOAPIC mask notifiers
(http://www.spinics.net/lists/kvm/msg19093.html), for example. 

Do you think that particular case should be handled in a different way?
(it could check whether a GSI is masked or not on both PIC/IOAPIC/LAPIC
from a different context other than mask notifier).

 But I think I found the way to not drop the lock here.

See, this is adding more complexity than simplifying things (the
recursive check).

  Say for example an ack notifier that takes the PIC or IOAPIC lock 
  (for whatever reason).
 What reason? No one should take PIC or IOAPIC lock outside of PIC or
 IOAPIC code. This is layering violation. IOAPIC should be accessed only
 through its public interface (set_irq/mmio_read|write/update_eoi)

See link above.

  
Also, irq_lock was held during ack and mask notifier callbacks, so they
(the callbacks) did not have to worry about concurrency.

   Is it possible to have more then one ack for the same interrupt line at
   the same time?
  
  Why not? But maybe this is a somewhat stupid point, because you can
  require the notifiers to handle that.
 If this is possible (and it looks like it may happen if IOAPIC broadcasts
 an interrupt vector to more then one vcpu) then it may be better to push
 complexity into an ack notifier to not penalize a common scenario.
 But again, if we will not drop the lock around ack notifier call they
 will be serialized.
 
  
You questioned the purpose of irq_lock earlier, and thats what it is:
synchronization between pic and ioapic blur at the irq notifiers.

   Pic has its own locking and it fails miserably in regards ack notifiers.
   I bet nobody tried device assignment with pic. It will not work.
  
  It fails to take irq_lock on automatic EOI because on guest entry 
  interrupts are disabled (see d5ecfdd25c412df9c0ecf3ab8e066461fd4f69df).
 This explains why the code is buggy, but does not fix the bug. There are
 two bugs at least with the pic + ack notifiers. The first one is that
 irq notifiers are called without irq_lock held and that will trigger
 WARN_ON(!mutex_is_locked(kvm-irq_lock)) in kvm_set_irq() in device
 assignment case (besides what is the point of a lock if it is not taken
 on every code path?).

This could be fixed by switching irq_lock to a spinlock.

  Another one is that you can't postpone call to ack notifiers in
 kvm_pic_read_irq() till after pic_update_irq(s). The reason is that
 pic_update_irq() will trigger acked interrupt immediately since ack
 notifier is the one who lowers irq line in device assignment case
 (that is the reason I haven't done the same in ioapic case BTW).

I'm not sure i understand this one, can you be more verbose please?

   irq_lock is indeed used to synchronization ioapic access, but the way
   it is done requires calling kvm_set_irq() with lock held and this adds
   unnecessary lock on a msi irq injection path.
   
Using RCU to synchronize ack/mask notifier list walk versus list
addition is good, but i'm not entirely sure that smaller locks like you
are proposing is a benefit long term (things become much more tricky).
   Without removing irq_lock RCU is useless since the list walking is always
   done with irq_lock held. I see smaller locks as simplification BTW. The
   thing is tricky now, or so I felt while trying to understand what
   irq_lock actually protects. With smaller locks with names that clearly
   indicates which data structure it protects I feel the code is much
   easy to understand.
  
  What is worrying is if you need to take both PIC and IOAPIC locks for 
  some operation (then have to worry about lock ordering etc).
  
 Lets not worry about far fetched theoretical cases especially 

Re: [PATCH 3/3] Update VMX_EPT_IDENTITY_PAGETABLE_ADDR to synchronize with kernel code.

2009-07-16 Thread Marcelo Tosatti
On Thu, Jul 16, 2009 at 11:02:22AM -0700, Jordan Justen wrote:
 Although VMX_EPT_IDENTITY_PAGETABLE_ADDR does not appear to be used within
 qemu-kvm, this change mirrors a similar change in the kernel kvm code.
 
 The purpose is to move the KVM 'EPT Identity Pages' from:
   0xfffbc000-0xfffbcfff
 to:
   0xfeffc000-0xfeffcfff
 
 The step is required to free up the 0xff00-0x (16MB) range
 for use with bios.bin.
 
 The KVM kernel change depends upon a change to kvm/bios/rombios.c so the bios
 INT15-E820 function will properly reserve the new location.
 
 Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
 ---
  kvm/include/x86/asm/vmx.h |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)
 
 diff --git a/kvm/include/x86/asm/vmx.h b/kvm/include/x86/asm/vmx.h
 index df8d4f9..99e2bb9 100644
 --- a/kvm/include/x86/asm/vmx.h
 +++ b/kvm/include/x86/asm/vmx.h
 @@ -403,7 +403,7 @@ enum vmcs_field {
  #define VMX_EPT_EXECUTABLE_MASK  0x4ull
  #define VMX_EPT_IGMT_BIT (1ull  6)
  
 -#define VMX_EPT_IDENTITY_PAGETABLE_ADDR  0xfffbc000ul
 +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR  0xfeffc000ul

Won't this conflict with an older BIOS? (the e820 reserved entry on
older qemu-kvm+bios will not cover the EPT identity table on kernels
with this patch).

Perhaps add a new ioctl (similar to the tss one) to so userspace can set
the address?

  
  
  #define ASM_VMX_VMCLEAR_RAX   .byte 0x66, 0x0f, 0xc7, 0x30
 -- 
 1.6.0.4
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Greg KH
On Thu, Jul 16, 2009 at 08:03:46PM +0300, Michael S. Tsirkin wrote:
 On Thu, Jul 16, 2009 at 08:52:08AM -0700, Greg KH wrote:
  On Thu, Jul 16, 2009 at 05:12:55PM +0200, Hans J. Koch wrote:
   On Thu, Jul 16, 2009 at 05:07:10PM +0300, Michael S. Tsirkin wrote:
On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
 How about moving that documentation into a place that people will 
 notice
 it, like the rest of the UIO documentation?

Greg,

would it make more sense to add this to
Documentation/DocBook/uio-howto.xml, or to create
Documentation/uio_pci_generic.txt ?
   
   Hi Michael,
   I'd prefer to have it in uio-howto.xml so that people only have to look in
   one place. In does not have to be very detailled, just a short explanation
   what this driver is all about and a short example how to use it.
  
  I agree, that would be the best place for it.
  
  thanks,
  
  greg k-h
 
 OK. Could you take a look at the following please?

Looks good to me.

Want to resend both of the patches so I can apply them to my tree?

thanks,

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv4] uio: add generic driver for PCI 2.3 devices

2009-07-16 Thread Greg KH
On Thu, Jul 16, 2009 at 03:31:01PM +0300, Michael S. Tsirkin wrote:
 On Wed, Jul 15, 2009 at 03:08:29PM -0700, Greg KH wrote:
  On Wed, Jul 15, 2009 at 11:13:40PM +0300, Michael S. Tsirkin wrote:
   This adds a generic uio driver that can bind to any PCI device.  First
   user will be virtualization where a qemu userspace process needs to give
   guest OS access to the device.
   
   Interrupts are handled using the Interrupt Disable bit in the PCI command
   register and Interrupt Status bit in the PCI status register.  All devices
   compliant to PCI 2.3 (circa 2002) and all compliant PCI Express devices 
   should
   support these bits.  Driver detects this support, and won't bind to 
   devices
   which do not support the Interrupt Disable Bit in the command register.
   
   It's expected that more features of interest to virtualization will be
   added to this driver in the future. Possibilities are: mmap for device
   resources, MSI/MSI-X, eventfd (to interface with kvm), iommu.
   
   Acked-by: Chris Wright chr...@redhat.com
   Signed-off-by: Michael S. Tsirkin m...@redhat.com
   ---
   
   Hans, Greg, please review and consider for upstream.
   
   This is intended to solve the problem in virtualization that shared
   interrupts do not work with assigned devices. Earlier versions of this
   patch have circulated on k...@vger.
  
  How does this play with the pci-stub driver that I thought was written
  to solve this very problem?
 
 
 AFAIK the problem pci stub was written to solve is simply to bind to a
 device. You then have to use another kernel module which looks the
 device up with something like pci_get_bus_and_slot to do anything
 useful. In particular, for non-shared interrupts, we can disable the
 interrupt in the apic. But this does not work well for shared
 interrupts. Thus this work.
 
 The uio driver will be used in virtualization scenarious, a couple
 of possible ones that have been mentioned on the kvm list are:
 - device assignment (guest access to device) for simple devices with
   shared interrupts: emulating PCI is tricky enough to better be done in
   userspace. shared interrupt support is important as it happens
   with real devices
 - simple communication between guest and host:
   we create a virtual device in host, and userspace
   driver in guest gets events and passes them on
   to e.g. dbus. shared interrupt support is important
   to avoid wasting irqs

Ah, ok, thanks for all of the explanation, that makes sense.

greg k-h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
Arnd Bergmann wrote:
 On Thursday 16 July 2009, Gregory Haskins wrote:
   
 Background: The original vbus code was tightly integrated with kvm.ko.  Avi
 suggested that we abstract the interfaces such that it could live outside
 of kvm.
 

 The code is still highly kvm-specific, you would not be able to use
 it with another hypervisor like lguest or vmware player, right?
   

In its current form, it is kvm specific primarily because it was not a
explicit design constraint of mine to support others.  However, there is
no reason why we could not generalize the interface if that is a
desirable trait.  Ultimately I would like to have my project support
other things like lguest, so this is not a bad idea.  Otherwise, someone
will invariably be proposing an lguest_xinterface next ;)

   
 Example usage: QEMU instantiates a guest, and an external module foo
 that desires the ability to interface with the guest (say via
 open(/dev/foo)).  QEMU may then issue a KVM_GET_VMID operation to acquire
 the u64-based vmid, and pass it to ioctl(foofd, FOO_SET_VMID, vmid).
 Upon receipt, the foo module can issue kvm_xinterface_find(vmid) to acquire
 the proper context.  Internally, the struct kvm* and associated
 struct module* will remain pinned at least until the foo module calls
 kvm_xinterface_put().
 

 Your approach allows passing the vmid from a process that does
 not own the kvm context. This looks like an intentional feature,
 but I can't see what this gains us.

This work is towards the implementation of lockless-shared-memory
subsystems, which includes ring constructs such as virtio-ring,
VJ-netchannels, and vbus-ioq.   I find that these designs perform
optimally when you allow two distinct contexts (producer + consumer) to
process the ring concurrently, which implies a disparate context from
the guest in question.  Note that the infrastructure we are discussing
does not impose a requirement for the contexts to be unique: it will
work equally well from the same or a different process.

For an example of this producer/consumer dynamic over shared memory in
action, please refer to my previous posting re: vbus

http://lkml.org/lkml/2009/4/21/408

I am working on v4 now, and this patch is part of the required support.

  

   
 As a final measure, we link the xinterface code statically
 into the kernel so that callers are guaranteed a stable interface to
 kvm_xinterface_find() without implicitly pinning kvm.ko or racing against
 it.
 

 I also don't understand this. Are you worried about driver modules
 breaking when an externally-compiled kvm.ko is loaded? The same could
 be achieved by defining your data structures kvm_xinterface_ops and
 kvm_xinterface in a kernel header that is not shipped by kvm-kmod but
 always taken from the kernel headers.
 It does not matter if the entry points are build into the kernel or
 exported from a kvm.ko as long as you define a fixed ABI.

 What is the problem with pinning kvm.ko from another module using
 its features?
   

Well, there is always the chance that I am doing something dumb or
missing the point ;)

But my rationale was as follows:  The problem is that kvm is a little
weird in the module ref department: If I were to do it the standard way
and link xinterface.o into kvm.o (and have any xinterface_find() users
do a tristate+depends on KVM), this would work as I believe you are
suggesting.  That is to say: whenever I loaded foo.ko, insmod would
automatically up the reference of kvm.ko.  The issue is that is not
quite what I really want ;)

I want to hold the reference to the entire .text chain, which includes
kvm.ko + [kvm-intel.ko | kvm-amd.ko].  If you look carefully, the
ops-owner that is assigned is actually the arch.ko.  In addition, I
wanted the kvm.ko lifetime to be associated with the lifetime of its
contexts actually in use, not the lifetime of its installed
dependencies.  Therefore, I did it this way.

But to your point, I suppose the dependency lifetime thing is not a huge
deal.  I could therefore modify the patch to simply link xinterface.o
into kvm.ko and still achieve the primary objective by retaining ops-owner.

Note that if we are going to generalize the interface to support other
guests as you may have been suggesting above, it should probably stay
statically linked (and perhaps live in ./lib or something)


 Can't you simply provide a function call to lookup the kvm context
 pointer from the file descriptor to achieve the same functionality?
   
You mean so have: struct kvm_xinterface *kvm_xinterface_find(int fd)

(instead of creating our own vmid namespace) ?

Or are you suggesting using fget() instead of kvm_xinterface_find()?

 To take that thought further, maybe the dependency can be turned
 around: If every user (pci-uio, virtio-net, ...) exposes a file
 descriptor based interface to user space, you can have a kvm
 ioctl to register the object behind that file descriptor with
 an existing kvm context to associate it with a guest.

FWIW: 

Re: [PATCH 3/3] Update VMX_EPT_IDENTITY_PAGETABLE_ADDR to synchronize with kernel code.

2009-07-16 Thread Jordan Justen
On Thu, 2009-07-16 at 11:18 -0700, Marcelo Tosatti wrote:
 On Thu, Jul 16, 2009 at 11:02:22AM -0700, Jordan Justen wrote:
  Although VMX_EPT_IDENTITY_PAGETABLE_ADDR does not appear to be used within
  qemu-kvm, this change mirrors a similar change in the kernel kvm code.
  
  The purpose is to move the KVM 'EPT Identity Pages' from:
0xfffbc000-0xfffbcfff
  to:
0xfeffc000-0xfeffcfff
  
  The step is required to free up the 0xff00-0x (16MB) range
  for use with bios.bin.
  
  The KVM kernel change depends upon a change to kvm/bios/rombios.c so the 
  bios
  INT15-E820 function will properly reserve the new location.
  
  Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
  ---
   kvm/include/x86/asm/vmx.h |2 +-
   1 files changed, 1 insertions(+), 1 deletions(-)
  
  diff --git a/kvm/include/x86/asm/vmx.h b/kvm/include/x86/asm/vmx.h
  index df8d4f9..99e2bb9 100644
  --- a/kvm/include/x86/asm/vmx.h
  +++ b/kvm/include/x86/asm/vmx.h
  @@ -403,7 +403,7 @@ enum vmcs_field {
   #define VMX_EPT_EXECUTABLE_MASK0x4ull
   #define VMX_EPT_IGMT_BIT   (1ull  6)
   
  -#define VMX_EPT_IDENTITY_PAGETABLE_ADDR0xfffbc000ul
  +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR0xfeffc000ul
 
 Won't this conflict with an older BIOS? (the e820 reserved entry on
 older qemu-kvm+bios will not cover the EPT identity table on kernels
 with this patch).
 
 Perhaps add a new ioctl (similar to the tss one) to so userspace can set
 the address?

I am not very familiar with the reasons why the EPT identity
page-table setup is happening within the kernel.

As it stands, there is the shared knowledge that the qemu-kvm bios
just happens to know that the kvm kernel code has reserved a
particular page of the address space.

It would be much easier to coordinate all the pieces if it were
all setup on the qemu-kvm side.

Is this possible?

   
   
   #define ASM_VMX_VMCLEAR_RAX   .byte 0x66, 0x0f, 0xc7, 0x30
  -- 
  1.6.0.4
  
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
Gregory Haskins wrote:
 Note that if we are going to generalize the interface to support other
 guests as you may have been suggesting above, it should probably stay
 statically linked (and perhaps live in ./lib or something)
   

More specifically, it can no longer live in kvm.ko.  I guess it
technically doesn't have to be statically linked, though (e.g.
xinterface.ko is fine, too).

Regards,
-Greg



signature.asc
Description: OpenPGP digital signature


Re: [PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Masami Hiramatsu
Masami Hiramatsu wrote:
 You are using the native objdump here.
 But I assume this fails miserably when you build x86 on a powerpc host.
 In other words - you broke an allyesconfig build for -next...
 We have $(OBJDUMP) for this.
 
 Ah, I see... Would you know actual name of x86-objdump on the powerpc
 (or any other crosscompiling host)? I just set OBJDUMP=objdump is OK?
 I'm not so sure about cross-compiling kernel...

Oops, we already have it. Yes, I'll use $(OBJDUMP).


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Arnd Bergmann
On Thursday 16 July 2009, Gregory Haskins wrote:
 Arnd Bergmann wrote:  

  Your approach allows passing the vmid from a process that does
  not own the kvm context. This looks like an intentional feature,
  but I can't see what this gains us.
 
 This work is towards the implementation of lockless-shared-memory
 subsystems, which includes ring constructs such as virtio-ring,
 VJ-netchannels, and vbus-ioq.   I find that these designs perform
 optimally when you allow two distinct contexts (producer + consumer) to
 process the ring concurrently, which implies a disparate context from
 the guest in question.  Note that the infrastructure we are discussing
 does not impose a requirement for the contexts to be unique: it will
 work equally well from the same or a different process.
 
 For an example of this producer/consumer dynamic over shared memory in
 action, please refer to my previous posting re: vbus
 
 http://lkml.org/lkml/2009/4/21/408
 
 I am working on v4 now, and this patch is part of the required support.

Ok. I can see how your approach gives you more flexibility in this
regard, but it does not seem critical.

 But to your point, I suppose the dependency lifetime thing is not a huge
 deal.  I could therefore modify the patch to simply link xinterface.o
 into kvm.ko and still achieve the primary objective by retaining ops-owner.

Right. And even if it's a separate module, holding an extra reference
on kvm.ko will not cause any harm.

  Can't you simply provide a function call to lookup the kvm context
  pointer from the file descriptor to achieve the same functionality?

 You mean so have: struct kvm_xinterface *kvm_xinterface_find(int fd)
 
 (instead of creating our own vmid namespace) ?
 
 Or are you suggesting using fget() instead of kvm_xinterface_find()?

I guess they are roughly equivalent. Either you pass a fd to
kvm_xinterface_find, or pass the struct file pointer you get
from fget. The latter is probably more convenient because it
allows you to pass around the struct file in kernel contexts
that don't have that file descriptor open.

  To take that thought further, maybe the dependency can be turned
  around: If every user (pci-uio, virtio-net, ...) exposes a file
  descriptor based interface to user space, you can have a kvm
  ioctl to register the object behind that file descriptor with
  an existing kvm context to associate it with a guest.
 
 FWIW: We do that already for the signaling path (see irqfd and ioeventfd
 in kvm.git).  Each side exposes interfaces that accept eventfds, and the
 fds are passed around that way.
 
 However, for the functions we are talking about now, I don't think it
 really works well to go the other way.  I could be misunderstanding what
 you mean, though.  What I mean is that it's KVM that is providing a
 service to the other modules (in this case, translating memory
 pointers), so what would an inverse interface look like for that?  And
 even if you came up with one, it seems to me that its just 6 of one,
 half-dozen of the other kind of thing.

I mean something like 

int kvm_ioctl_register_service(struct file *filp, unsigned long arg)
{
struct file *service = fget(arg);
struct kvm *kvm = filp-private_data;

if (!service-f_ops-new_xinterface_register)
return -EINVAL;

return service-f_ops-new_xinterface_register(service, (void*)kvm,
kvm_xinterface_ops);
}

This would assume that we define a new file_operation specifically for this,
which would simplify the code, but there are other ways to achieve the same.
It would even mean that you don't need any static code as an interface layer.

Arnd 
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] Update VMX_EPT_IDENTITY_PAGETABLE_ADDR to synchronize with kernel code.

2009-07-16 Thread Marcelo Tosatti
On Thu, Jul 16, 2009 at 11:48:46AM -0700, Jordan Justen wrote:
 On Thu, 2009-07-16 at 11:18 -0700, Marcelo Tosatti wrote:
  On Thu, Jul 16, 2009 at 11:02:22AM -0700, Jordan Justen wrote:
   Although VMX_EPT_IDENTITY_PAGETABLE_ADDR does not appear to be used within
   qemu-kvm, this change mirrors a similar change in the kernel kvm code.
   
   The purpose is to move the KVM 'EPT Identity Pages' from:
 0xfffbc000-0xfffbcfff
   to:
 0xfeffc000-0xfeffcfff
   
   The step is required to free up the 0xff00-0x (16MB) range
   for use with bios.bin.
   
   The KVM kernel change depends upon a change to kvm/bios/rombios.c so the 
   bios
   INT15-E820 function will properly reserve the new location.
   
   Signed-off-by: Jordan Justen jordan.l.jus...@intel.com
   ---
kvm/include/x86/asm/vmx.h |2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
   
   diff --git a/kvm/include/x86/asm/vmx.h b/kvm/include/x86/asm/vmx.h
   index df8d4f9..99e2bb9 100644
   --- a/kvm/include/x86/asm/vmx.h
   +++ b/kvm/include/x86/asm/vmx.h
   @@ -403,7 +403,7 @@ enum vmcs_field {
#define VMX_EPT_EXECUTABLE_MASK  0x4ull
#define VMX_EPT_IGMT_BIT (1ull  6)

   -#define VMX_EPT_IDENTITY_PAGETABLE_ADDR  0xfffbc000ul
   +#define VMX_EPT_IDENTITY_PAGETABLE_ADDR  0xfeffc000ul
  
  Won't this conflict with an older BIOS? (the e820 reserved entry on
  older qemu-kvm+bios will not cover the EPT identity table on kernels
  with this patch).
  
  Perhaps add a new ioctl (similar to the tss one) to so userspace can set
  the address?
 
 I am not very familiar with the reasons why the EPT identity
 page-table setup is happening within the kernel.
 
 As it stands, there is the shared knowledge that the qemu-kvm bios
 just happens to know that the kvm kernel code has reserved a
 particular page of the address space.
 
 It would be much easier to coordinate all the pieces if it were
 all setup on the qemu-kvm side.
 
 Is this possible?

It is possible but all of the EPT implementation is in the kernel, so it
does not make much sense to have the details of the identity table in
qemu-kvm.

The address of it though can be controlled by qemu-kvm.

Sheng?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Gregory Haskins
Zan Lynx wrote:
 Gregory Haskins wrote:
 Gregory Haskins wrote:
 Note that if we are going to generalize the interface to support other
 guests as you may have been suggesting above, it should probably stay
 statically linked (and perhaps live in ./lib or something)
   

 More specifically, it can no longer live in kvm.ko.  I guess it
 technically doesn't have to be statically linked, though (e.g.
 xinterface.ko is fine, too).

 Speaking as someone who knows nothing about this, if this is going to
 be a module and visible in places like lsmod, could you name it
 something else?

 I see xinterface.ko and the first thing in my head is something to
 do with the X Window System.

 How about kvm_xinterface or vguest_xinterface?


Heh, I totally agree.  That was just pseudo-naming, anyway. ;)

If it was going to be generic, I would do something like
virt-xinterface.ko.

Kind Regards,
-Greg



signature.asc
Description: OpenPGP digital signature


Re: [PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Sam Ravnborg
  +  cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f 
  $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
  +
  
  You are using the native objdump here.
  But I assume this fails miserably when you build x86 on a powerpc host.
  In other words - you broke an allyesconfig build for -next...
  We have $(OBJDUMP) for this.
 
 Ah, I see... Would you know actual name of x86-objdump on the powerpc
 (or any other crosscompiling host)? I just set OBJDUMP=objdump is OK?
 I'm not so sure about cross-compiling kernel...

Replacing objdump with $(OBJDUMP) will do the trick.
We set OBJDUMP to the correct value in the top-level makefile.

Are there any parts of your user-space program that rely
on the host is little-endian?
If it does then it would fail on a power-pc target despite using the
correct objdump.

Sam
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread Sam Ravnborg
On Thu, Jul 16, 2009 at 01:28:54PM -0400, Masami Hiramatsu wrote:
 Sam Ravnborg wrote:
  diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
  new file mode 100644
  index 000..01e079a
  --- /dev/null
  +++ b/arch/x86/include/asm/inat.h
  @@ -0,0 +1,125 @@
  +#ifndef _ASM_INAT_INAT_H
  +#define _ASM_INAT_INAT_H
  
  [With reference to comment on patch 2/12...]
  You create inat.h here.
  Could you investigave what is needed to factor out the stuff
  needed from userspace so we can avoid the ugly havk where
  you redefine types.h?
 
 Sorry, I'm a bit confusing.
 Would you mean that I should break down user_include.h and
 add those redefined types in inat.h?
No - try to factor out what is needed for your program
so you can avoid user_include.h entirely.
 
  Maybe create a inat_types.h + inat.h as we do in other cases?
 
 And inat_types.h has two parts, one for kernel, and one for
 userspace(which is moved from user_include.h), is that right?
More like inat_types.h include pure definitions and inat.h
define all the macros (that would be much nicer if expressed
as static inlines).

The real thing to consider is what is needed from your userspace
program and is also required by the kernel.
I did not event remotely try to find out - as I guess you know it.
So try to isolate these bits somehow and you have then nicely dropped
a lot of dependencies on the remainign headers and can thus
hopefully get rid of the ugly usser_include.h hack.

Sam
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH] KVM: introduce xinterface API for external interaction with guests

2009-07-16 Thread Zan Lynx

Gregory Haskins wrote:

Gregory Haskins wrote:

Note that if we are going to generalize the interface to support other
guests as you may have been suggesting above, it should probably stay
statically linked (and perhaps live in ./lib or something)
  


More specifically, it can no longer live in kvm.ko.  I guess it
technically doesn't have to be statically linked, though (e.g.
xinterface.ko is fine, too).


Speaking as someone who knows nothing about this, if this is going to be 
a module and visible in places like lsmod, could you name it something else?


I see xinterface.ko and the first thing in my head is something to do 
with the X Window System.


How about kvm_xinterface or vguest_xinterface?

--
Zan Lynx
zl...@acm.org

Knowledge is Power.  Power Corrupts.  Study Hard.  Be Evil.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 02/11] x86: x86 instruction decoder build-time selftest

2009-07-16 Thread Masami Hiramatsu
Sam Ravnborg wrote:
 +  cmd_posttest = objdump -d $(objtree)/vmlinux | awk -f 
 $(srctree)/arch/x86/scripts/distill.awk | $(obj)/test_get_len
 +
 You are using the native objdump here.
 But I assume this fails miserably when you build x86 on a powerpc host.
 In other words - you broke an allyesconfig build for -next...
 We have $(OBJDUMP) for this.
 Ah, I see... Would you know actual name of x86-objdump on the powerpc
 (or any other crosscompiling host)? I just set OBJDUMP=objdump is OK?
 I'm not so sure about cross-compiling kernel...
 
 Replacing objdump with $(OBJDUMP) will do the trick.
 We set OBJDUMP to the correct value in the top-level makefile.
 
 Are there any parts of your user-space program that rely
 on the host is little-endian?
 If it does then it would fail on a power-pc target despite using the
 correct objdump.

Hmm, as far as I can see, the result of get_next() macro with the types
more than two bytes(s16, s32...) might be effected.
But it doesn't effect get_insn_len test because those values are ignored.

Thank you,

-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH -tip -v12 01/11] x86: instruction decoder API

2009-07-16 Thread Masami Hiramatsu
Sam Ravnborg wrote:
 On Thu, Jul 16, 2009 at 01:28:54PM -0400, Masami Hiramatsu wrote:
 Sam Ravnborg wrote:
 diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
 new file mode 100644
 index 000..01e079a
 --- /dev/null
 +++ b/arch/x86/include/asm/inat.h
 @@ -0,0 +1,125 @@
 +#ifndef _ASM_INAT_INAT_H
 +#define _ASM_INAT_INAT_H
 [With reference to comment on patch 2/12...]
 You create inat.h here.
 Could you investigave what is needed to factor out the stuff
 needed from userspace so we can avoid the ugly havk where
 you redefine types.h?
 Sorry, I'm a bit confusing.
 Would you mean that I should break down user_include.h and
 add those redefined types in inat.h?
 No - try to factor out what is needed for your program
 so you can avoid user_include.h entirely.
 Maybe create a inat_types.h + inat.h as we do in other cases?
 And inat_types.h has two parts, one for kernel, and one for
 userspace(which is moved from user_include.h), is that right?
 More like inat_types.h include pure definitions and inat.h
 define all the macros (that would be much nicer if expressed
 as static inlines).

OK, some macros still need to be macros, because it will be used
for defining static tables.

 The real thing to consider is what is needed from your userspace
 program and is also required by the kernel.
 I did not event remotely try to find out - as I guess you know it.
 So try to isolate these bits somehow and you have then nicely dropped
 a lot of dependencies on the remainign headers and can thus
 hopefully get rid of the ugly usser_include.h hack.

OK, I'll try to remove user_include.h hack.

Thank you so much!


-- 
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division

e-mail: mhira...@redhat.com

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >