[PATCH] kvm: testsuite: test EFER.NXE

2009-03-31 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Accesses with the NX bit set in a pte or pde behave differently when EFER.NXE
is set or unset.  Test that.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/user/test/x86/access.c b/user/test/x86/access.c
index 59a5756..272a4ef 100644
--- a/user/test/x86/access.c
+++ b/user/test/x86/access.c
@@ -60,7 +60,7 @@ enum {
 AC_ACCESS_TWICE,
 // AC_ACCESS_PTE,
 
-// AC_CPU_EFER_NX,
+AC_CPU_EFER_NX,
 AC_CPU_CR0_WP,
 
 NR_AC_FLAGS
@@ -86,6 +86,7 @@ const char *ac_names[] = {
 [AC_ACCESS_USER] = user,
 [AC_ACCESS_FETCH] = fetch,
 [AC_ACCESS_TWICE] = twice,
+[AC_CPU_EFER_NX] = efer.nx,
 [AC_CPU_CR0_WP] = cr0.wp,
 };
 
@@ -367,10 +368,12 @@ void ac_test_setup_pte(ac_test_t *at)
 at-expected_error = PFERR_PRESENT_MASK;
 
 pde_valid = at-flags[AC_PDE_PRESENT]
- !at-flags[AC_PDE_BIT51];
+ !at-flags[AC_PDE_BIT51]
+ !(at-flags[AC_PDE_NX]  !at-flags[AC_CPU_EFER_NX]);
 pte_valid = pde_valid
  at-flags[AC_PTE_PRESENT]
- !at-flags[AC_PTE_BIT51];
+ !at-flags[AC_PTE_BIT51]
+ !(at-flags[AC_PTE_NX]  !at-flags[AC_CPU_EFER_NX]);
 if (at-flags[AC_ACCESS_TWICE]) {
if (pde_valid) {
at-expected_pde |= PT_ACCESSED_MASK;
@@ -463,6 +466,7 @@ int ac_test_do_access(ac_test_t *at)
 
 unsigned r = unique;
 set_cr0_wp(at-flags[AC_CPU_CR0_WP]);
+set_efer_nx(at-flags[AC_CPU_EFER_NX]);
 
 if (at-flags[AC_ACCESS_TWICE]) {
asm volatile (
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: testsuite: test reserved bits in mmu access tests

2009-03-31 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

test that reserved bits in pdes and ptes cause faults and that the
expected error code is issued.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/user/test/x86/access.c b/user/test/x86/access.c
index 49f74b3..59a5756 100644
--- a/user/test/x86/access.c
+++ b/user/test/x86/access.c
@@ -43,6 +43,7 @@ enum {
 AC_PTE_ACCESSED,
 AC_PTE_DIRTY,
 AC_PTE_NX,
+AC_PTE_BIT51,
 
 AC_PDE_PRESENT,
 AC_PDE_WRITABLE,
@@ -51,6 +52,7 @@ enum {
 AC_PDE_DIRTY,
 AC_PDE_PSE,
 AC_PDE_NX,
+AC_PDE_BIT51,
 
 AC_ACCESS_USER,
 AC_ACCESS_WRITE,
@@ -71,6 +73,7 @@ const char *ac_names[] = {
 [AC_PTE_USER] = pte.user,
 [AC_PTE_DIRTY] = pte.d,
 [AC_PTE_NX] = pte.nx,
+[AC_PTE_BIT51] = pte.51,
 [AC_PDE_PRESENT] = pde.p,
 [AC_PDE_ACCESSED] = pde.a,
 [AC_PDE_WRITABLE] = pde.rw,
@@ -78,6 +81,7 @@ const char *ac_names[] = {
 [AC_PDE_DIRTY] = pde.d,
 [AC_PDE_PSE] = pde.pse,
 [AC_PDE_NX] = pde.nx,
+[AC_PDE_BIT51] = pde.51,
 [AC_ACCESS_WRITE] = write,
 [AC_ACCESS_USER] = user,
 [AC_ACCESS_FETCH] = fetch,
@@ -293,6 +297,7 @@ void ac_test_reset_pt_pool(ac_test_t *at)
 void ac_test_setup_pte(ac_test_t *at)
 {
 unsigned long root = read_cr3();
+int pde_valid, pte_valid;
 
 if (!ac_test_enough_room(at))
ac_test_reset_pt_pool(at);
@@ -328,6 +333,8 @@ void ac_test_setup_pte(ac_test_t *at)
pte |= PT_DIRTY_MASK;
if (at-flags[AC_PDE_NX])
pte |= PT_NX_MASK;
+   if (at-flags[AC_PDE_BIT51])
+   pte |= 1ull  51;
at-pdep = vroot[index];
break;
case 1:
@@ -344,6 +351,8 @@ void ac_test_setup_pte(ac_test_t *at)
pte |= PT_DIRTY_MASK;
if (at-flags[AC_PTE_NX])
pte |= PT_NX_MASK;
+   if (at-flags[AC_PTE_BIT51])
+   pte |= 1ull  51;
at-ptep = vroot[index];
break;
}
@@ -357,10 +366,15 @@ void ac_test_setup_pte(ac_test_t *at)
 at-expected_fault = 0;
 at-expected_error = PFERR_PRESENT_MASK;
 
+pde_valid = at-flags[AC_PDE_PRESENT]
+ !at-flags[AC_PDE_BIT51];
+pte_valid = pde_valid
+ at-flags[AC_PTE_PRESENT]
+ !at-flags[AC_PTE_BIT51];
 if (at-flags[AC_ACCESS_TWICE]) {
-   if (at-flags[AC_PDE_PRESENT]) {
+   if (pde_valid) {
at-expected_pde |= PT_ACCESSED_MASK;
-   if (at-flags[AC_PTE_PRESENT])
+   if (pte_valid)
at-expected_pte |= PT_ACCESSED_MASK;
}
 }
@@ -377,6 +391,9 @@ void ac_test_setup_pte(ac_test_t *at)
 if (!at-flags[AC_PDE_PRESENT]) {
at-expected_fault = 1;
at-expected_error = ~PFERR_PRESENT_MASK;
+} else if (!pde_valid) {
+at-expected_fault = 1;
+at-expected_error |= PFERR_RESERVED_MASK;
 }
 
 if (at-flags[AC_ACCESS_USER]  !at-flags[AC_PDE_USER])
@@ -404,6 +421,9 @@ void ac_test_setup_pte(ac_test_t *at)
 if (!at-flags[AC_PTE_PRESENT]) {
at-expected_fault = 1;
at-expected_error = ~PFERR_PRESENT_MASK;
+} else if (!pte_valid) {
+at-expected_fault = 1;
+at-expected_error |= PFERR_RESERVED_MASK;
 }
 
 if (at-flags[AC_ACCESS_USER]  !at-flags[AC_PTE_USER])
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: MMU: Use different shadows when EFER.NXE changes

2009-03-31 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

A pte that is shadowed when the guest EFER.NXE=1 is not valid when
EFER.NXE=0; if bit 63 is set, the pte should cause a fault, and since the
shadow EFER always has NX enabled, this won't happen.

Fix by using a different shadow page table for different EFER.NXE bits.  This
allows vcpus to run correctly with different values of EFER.NXE, and for
transitions on this bit to be handled correctly without requiring a full
flush.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 548b97d..3fc4623 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
unsigned access:3;
unsigned invalid:1;
unsigned cr4_pge:1;
+   unsigned nxe:1;
};
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9702353..bb30169 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -519,6 +519,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer |= vcpu-arch.shadow_efer  EFER_LMA;
 
vcpu-arch.shadow_efer = efer;
+
+   vcpu-arch.mmu.base_role.nxe = (efer  EFER_NX)  !tdp_enabled;
+   kvm_mmu_reset_context(vcpu);
 }
 
 void kvm_enable_efer_bits(u64 mask)
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] KVM: remove pointless conditional before kfree() in lapic initialization

2009-03-31 Thread Avi Kivity
From: Wei Yongjun yj...@cn.fujitsu.com

Remove pointless conditional before kfree().

Signed-off-by: Wei Yongjun yj...@cn.fujitsu.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bb30169..aeb0193 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1588,8 +1588,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
 out:
-   if (lapic)
-   kfree(lapic);
+   kfree(lapic);
return r;
 }
 
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: Use rsvd_bits_mask in load_pdptrs for cleanup and considering EXB bit

2009-03-31 Thread Dong, Eddie
Neiger, Gil wrote:
 PDPTEs are used only if CR0.PG=CR4.PAE=1.
 
 In that situation, their format depends on the value of IA32_EFER.LMA.
 
 If IA32_EFER.LMA=0, bit 63 is reserved and must be 0 in any PDPTE
 that is marked present.  The execute-disable setting of a page is
 determined only by the PDE and PTE.  
 
 If IA32_EFER.LMA=1, bit 63 is used for the execute-disable in PML4
 entries, PDPTEs, PDEs, and PTEs (assuming IA32_EFER.NXE=1). 
 
   - Gil

Rebased.
Thanks, eddie


commit 032caed3da123950eeb3e192baf444d4eae80c85
Author: root r...@eddie-wb.localdomain
Date:   Tue Mar 31 16:22:49 2009 +0800

Use rsvd_bits_mask in load_pdptrs and remove bit 5-6 from rsvd_bits_mask 
per latest SDM.

Signed-off-by: Eddie Dong eddie.d...@intel.com

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2eab758..1bed3aa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -225,11 +225,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
return vcpu-arch.shadow_efer  EFER_NX;
 }
 
-static int is_present_pte(unsigned long pte)
-{
-   return pte  PT_PRESENT_MASK;
-}
-
 static int is_shadow_present_pte(u64 pte)
 {
return pte != shadow_trap_nonpresent_pte
@@ -2199,6 +2194,9 @@ void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int 
level)
context-rsvd_bits_mask[1][0] = 0;
break;
case PT32E_ROOT_LEVEL:
+   context-rsvd_bits_mask[0][2] =
+   rsvd_bits(maxphyaddr, 63) |
+   rsvd_bits(7, 8) | rsvd_bits(1, 2);  /* PDPTE */
context-rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62);  /* PDE */
context-rsvd_bits_mask[0][0] = exb_bit_rsvd |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 258e5d5..2a6eb50 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu-arch.cr0  X86_CR0_PG;
 }
 
+static inline int is_present_pte(unsigned long pte)
+{
+   return pte  PT_PRESENT_MASK;
+}
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 961bd2b..b449ff0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -233,7 +233,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i  ARRAY_SIZE(pdpte); ++i) {
-   if ((pdpte[i]  1)  (pdpte[i]  0xfff001e6ull)) {
+   if (is_present_pte(pdpte[i]) 
+   (pdpte[i]  vcpu-arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}

cr3_load_rsvd.patch
Description: cr3_load_rsvd.patch


Re: RFC: Add reserved bits check

2009-03-31 Thread Avi Kivity

Dong, Eddie wrote:

+   case PT64_ROOT_LEVEL:
+   context-rsvd_bits_mask[0][3] = exb_bit_rsvd |
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+   context-rsvd_bits_mask[0][2] = exb_bit_rsvd |
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+   context-rsvd_bits_mask[0][1] = exb_bit_rsvd |
+   rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+   context-rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51)


I added a test for this and it noticed the pte bits missed nx.  I fixed 
that up.  I also added code to shadow into different pages when EFER.NXE 
changes, so that we can handle the transition without flushing all 
shadow (and also run vcpus with mismatched EFER.NX).


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: strange guest slowness after some time

2009-03-31 Thread Tomasz Chmielewski

Felix Leimbach schrieb:

Tomasz Chmielewski wrote:

Felix Leimbach schrieb:

Out of 3 e1000 guests none has ever been hit.

Observed with kvm-83 and kvm-84 with the host running in-kernel KVM 
code (linux 2.6.25.7)

Could you add an (unused) e1000 interface to your virtio guests?
As this issue happens rarely for me, maybe you could help to reproduce
it as well (i.e. if the network gets slow on the virtio interface, give e1000
an IP address, and check whether the network is also slow on e1000 on the very
same guest).

Will do and report


BTW, what CPU do you have?

One dual core Opteron 2212
Note: I will upgrade to two Shanghai Quad-Cores in 2 weeks and test with 
those as well.


I have this slowness on an Intel CPU as well, after about 10 days of 
guest uptime (using virtio net):


processor   : 1
vendor_id   : GenuineIntel
cpu family  : 6
model   : 15
model name  : Intel(R) Xeon(R) CPU3050  @ 2.13GHz
stepping: 6
cpu MHz : 2133.410
cache size  : 2048 KB
physical id : 0
siblings: 2
core id : 1
cpu cores   : 2
fpu : yes
fpu_exception   : yes
cpuid level : 10
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge 
mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe 
syscall lm constant_tsc arch_perfmon pebs bts rep_good pni monitor 
ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm

bogomips: 4266.87
clflush size: 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:


--
Tomasz Chmielewski
http://wpkg.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm: remove pointless conditional before kfree()

2009-03-31 Thread Avi Kivity
Wei Yongjun wrote:
 Remove pointless conditional before kfree().

   

Applied, thanks.

-- 
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Use rsvd_bits_mask in load_pdptrs for cleanup and considering EXB bit

2009-03-31 Thread Avi Kivity

Dong, Eddie wrote:

Neiger, Gil wrote:
  

PDPTEs are used only if CR0.PG=CR4.PAE=1.

In that situation, their format depends on the value of IA32_EFER.LMA.

If IA32_EFER.LMA=0, bit 63 is reserved and must be 0 in any PDPTE
that is marked present.  The execute-disable setting of a page is
determined only by the PDE and PTE.  


If IA32_EFER.LMA=1, bit 63 is used for the execute-disable in PML4
entries, PDPTEs, PDEs, and PTEs (assuming IA32_EFER.NXE=1). 


- Gil



Rebased.
Thanks, eddie


  


Looks good, but doesn't apply; please check if you are working against 
the latest version.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Biweekly KVM Test report, kernel 0c7771... userspace 1223a0...

2009-03-31 Thread Amit Shah
On (Mon) Mar 30 2009 [18:02:16], Avi Kivity wrote:
 Amit Shah wrote:

 /*
  * Check whether the Architectural PerfMon supports
  * Unhalted Core Cycles Event or not.
  * NOTE: Corresponding bit = 0 in ebx indicates event present.
  */
 cpuid(10, (eax.full), ebx, unused, unused);
 if ((eax.split.mask_length
 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
 (ebx  ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
 return 0;

   
 So I think it can be done.
 

 Only if the guest kernel (or module accessing those registers) look at
 the cpuid output, right? I checked this for the Kaspersky AV on Windows,
 the crash bug I was solving and that program doesn't seem to check
 cpuid.
   

 The only way to solve all possible cases is to implement the performance  
 counters MSRs.  That's not going to happen in a hurry, we're looking at  
 making the known cases work.

Looks like it does get solved...

 RHEL 5.3 is based on 2.6.18 and this patch appears to have entered in
 2.6.21. I saw this on 5.3 as well.
   

 The snippet I quoted came from RHEL 5.3.  It checks cpuid so we should  
 be able to make it fail gracefully.

Our default CPU is qemu64, which has cpuid till level 2. Trying with
-cpu core2duo, this doesn't happen (even without setting EBX to the 0x3f
value) on both, RHEL 5.3 and Win+Kaspersky guests.

qemu64 loosely models some AMD CPU type. I guess we can update it to
expose cpuid levels up to 10 (and leave xlevel at the current value).
That should take care of this.

Amit
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 4/4] Fix task switching.

2009-03-31 Thread Kohl, Bernhard (NSN - DE/Munich)
Jan Kiszka wrote:
 
 Gleb Natapov wrote:
  The patch fixes two problems with task switching.
  1. Back link is written to a wrong TSS.
  2. Instruction emulation is not needed if the reason for task switch
 is a task gate in IDT and access to it is caused by an 
 external event.
  
  2 is currently solved only for VMX since there is no 
 reliable way to
  skip an instruction in SVM. We should emulate it instead.
 
 Does this series fix all issues Bernhard, Thomas and Julian 
 stumbled over?
 
 Jan

I will try this today. Thanks.

Bernhard
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Live memory allocation?

2009-03-31 Thread Tomasz Chmielewski

Javier Guerra schrieb:

On Mon, Mar 30, 2009 at 10:15 AM, Tomasz Chmielewski man...@wpkg.org wrote:

Still, if there is free memory on host, why not use it for cache?


because it's best used on the guest;


It is correct, but not realistic from the administrative point of view.

Let's say you have several KVM hosts, each with 16 GB RAM.

Guests can come and go - so you give them only as much memory as they 
need (more or less).
In other words, normally, you don't create the first guest with 16 GB 
RAM assigned. Upon creation of the second guest 2 hours later, you don't 
stop guest 1, just to start both guests with 8 GB RAM a while later. And 
so on. And so on, stopping and starting a whole bunch of guests until 
each of them has 512 MB RAM.


No, not all guests support ballooning.
But for those which support ballooning, the easiest way to implement it 
would be to write a user-space daemon I guess.




so, not
cacheing already-cached data, it's free to cache other more important
things, or to keep more of the VMs memory on RAM.


Correct - if the host knew what the guest already cached, the host could 
use RAM for other things.


Anyway, there are still more pressing issues than that ;)


--
Tomasz Chmielewski
http://wpkg.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kvm binary names

2009-03-31 Thread Daniel P. Berrange
On Mon, Mar 30, 2009 at 05:12:30PM -0400, Bill Davidsen wrote:
 Daniel P. Berrange wrote:
 On Fri, Mar 20, 2009 at 10:57:50AM -0700, jd wrote:
 Hi
What is the motivation for having different kvm binary names on 
various linux distributions.. ? 
 -- kvm
 -- qemu-system-x86_64
 -- qemu-kvm
 
 I can tell you the history from the Fedora POV at least...
 
 We already had 'qemu', 'qemu-system-x86_64', etc from the existing
 plain qemu emulator RPMs we distributed.
 
 The KVM makefile creates a binary called qemu-system-x86_64 but this
 clashes with the existing QEMU RPM, so we had to rename it somehow
 to allow parallel installation of KVM and QEMU RPMs.
 
 KVM already ships with a python script called 'kvm' and we didn't
 want to clash with that either, so we eventually settled on calling
 it 'qemu-kvm'. Other distros didn't worry about clash with the python
 script so called their binary just 'kvm'
 
 Don't stop there, why does Fedora have both qemu-ppc and 
 qemu-system-ppc and so forth? There are many of these, arm and m68k 
 for instance. On x86 I assume that they are both emulated, and they are not 
 two names for the same executable or such, so what are they and how to 
 choose which to use?

Those are totally different things.  qemu-$ARCH  is a userspace
emulator, while qemu-system-$ARCH is a full machine emulator. 

The userspace emulator lets you directly execute binaries from the
other non-native arch. The machine emulator provides a complete
virtual machine where you can run an entire OS.


Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] kvm: remove pointless conditional before kfree()

2009-03-31 Thread Wei Yongjun
Remove pointless conditional before kfree().

Signed-off-by: Wei Yongjun yj...@cn.fujitsu.com
---
 arch/x86/kvm/x86.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8ca100a..8fb4c92 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1580,8 +1580,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
}
 out:
-   if (lapic)
-   kfree(lapic);
+   kfree(lapic);
return r;
 }
 
-- 
1.5.3.8




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Name kvm ambiguous

2009-03-31 Thread Oliver Rath

Hi List,

It is interesting that, on hearing the name kvm, most people first think of
vga-hardware switches. If you type kvm into Google, you'll never find
this project in first place. Maybe (I know that's difficult to
do) it would be better to look for another name? Like kevim? It's only a
suggestion...


Regards,

Oliver

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Name kvm ambiguous

2009-03-31 Thread Daniel P. Berrange
On Tue, Mar 31, 2009 at 11:51:03AM +0200, Oliver Rath wrote:
 Hi List,
 
 it is interesting, that most people at name kvm first think for 
 vga-hardware switches. If you type kvm in google, you'll never find 
 this project at the first place. Maybe (i know thats difficult to 
 realize) it is better looking for another name? Like kevim? Its only a 
 suggestion...

Err, if I type 'kvm' into Google, this project is the #1 result...

  Main Page - KVM
   KVM (for Kernel-based Virtual Machine) is a full virtualization 
   solution for Linux on x86 hardware containing virtualization 
   extensions (Intel VT or AMD-V). ...
   www.linux-kvm.org/ - 9k - Cached - Similar pages

Daniel
-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IO on guest is 20 times slower than host

2009-03-31 Thread Joerg Roedel
On Sun, Mar 29, 2009 at 10:10:26PM +0300, Avi Kivity wrote:
 Avi Kivity wrote:
 Kurt Yoder wrote:
 slow host cpu information, core 1 of 16:

 processor   : 0
 vendor_id   : AuthenticAMD
 cpu family  : 16
 model   : 4
 model name  : Quad-Core AMD Opteron(tm) Processor 8382
 stepping: 2
 cpu MHz : 2611.998
 cache size  : 512 KB
 physical id : 0
 siblings: 4
 core id : 0
 cpu cores   : 4
 apicid  : 0
 initial apicid  : 0
 fpu : yes
 fpu_exception   : yes
 cpuid level : 5
 wp  : yes
 flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr  
 pge mca
 cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall mmxext fxsr_opt
 pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl pni monitor
 cx16 popcnt lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a
 misalignsse 3dnowprefetch osvw ibs skinit wdt
 bogomips: 5223.97
 TLB size: 1024 4K pages
 clflush size: 64
 cache_alignment : 64
 address sizes   : 48 bits physical, 48 bits virtual
 power management: ts ttp tm stc 100mhzsteps hwpstate


   

 Can you try loading kvm_amd on this host with 'modprobe kvm-amd npt=0'?


 If it helps, then the guest is messing up the cpu cache.  Try the  
 attached patch.

 -- 
 I have a truly marvellous patch that fixes the bug which this
 signature is too narrow to contain.


 diff --git a/kernel/x86/kvm/svm.c b/kernel/x86/kvm/svm.c
 index 1fcbc17..d9774e9 100644
 --- a/kernel/x86/kvm/svm.c
 +++ b/kernel/x86/kvm/svm.c
 @@ -575,7 +575,7 @@ static void init_vmcb(struct vcpu_svm *svm)
   INTERCEPT_CR3_MASK);
   control-intercept_cr_write = ~(INTERCEPT_CR0_MASK|
INTERCEPT_CR3_MASK);
 - save-g_pat = 0x0007040600070406ULL;
 + save-g_pat = 0x0606060606060606ULL;
   /* enable caching because the QEMU Bios doesn't enable it */
   save-cr0 = X86_CR0_ET;
   save-cr3 = 0;

Yeah, that patch makes sense. But I think we need some more work on this
because the guest may change the pat msr afterwards. Best would be a simple
shadow of the pat msr. Last question is how this will affect pci passthrough.

Joerg


-- 
   | Advanced Micro Devices GmbH
 Operating | Karl-Hammerschmidt-Str. 34, 85609 Dornach bei München
 System| 
 Research  | Geschäftsführer: Jochen Polster, Thomas M. McCoy, Giuliano Meroni
 Center| Sitz: Dornach, Gemeinde Aschheim, Landkreis München
   | Registergericht München, HRB Nr. 43632

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IO on guest is 20 times slower than host

2009-03-31 Thread Avi Kivity

Joerg Roedel wrote:

--- a/kernel/x86/kvm/svm.c
+++ b/kernel/x86/kvm/svm.c
@@ -575,7 +575,7 @@ static void init_vmcb(struct vcpu_svm *svm)
INTERCEPT_CR3_MASK);
control-intercept_cr_write = ~(INTERCEPT_CR0_MASK|
 INTERCEPT_CR3_MASK);
-   save-g_pat = 0x0007040600070406ULL;
+   save-g_pat = 0x0606060606060606ULL;
/* enable caching because the QEMU Bios doesn't enable it */
save-cr0 = X86_CR0_ET;
save-cr3 = 0;



Yeah, that patch makes sense. But I think we need some more work on this
because the guest may change the pat msr afterwards. Best would be a simple
shadow of the pat msr. Last question is how this will affect pci passthrough.


This is just a stopgap; we can later add proper pat shadowing.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Name

2009-03-31 Thread Gerrit Slomma
Daniel P. Berrange berrange at redhat.com writes:

 
 On Tue, Mar 31, 2009 at 11:51:03AM +0200, Oliver Rath wrote:
  Hi List,
  
  it is interesting, that most people at name kvm first think for 
  vga-hardware switches. If you type kvm in google, you'll never find 
  this project at the first place. Maybe (i know thats difficult to 
  realize) it is better looking for another name? Like kevim? Its only a 
  suggestion...
 
 Err, if I type 'kvm' into Google, this project is the #1 result...
 
   Main Page - KVM
KVM (for Kernel-based Virtual Machine) is a full virtualization 
solution for Linux on x86 hardware containing virtualization 
extensions (Intel VT or AMD-V). ...
www.linux-kvm.org/ - 9k - Cached - Similar pages
 
 Daniel

He is likely to use the german version of google.
If you search kvm via google.de the #1 result is www.kvm-switch.de and all the 
propaganda left and top is also about kvm-switches. linux-kvm.org follows on #4 
though.

Gerrit

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Izik Eidus

KAMEZAWA Hiroyuki wrote:

On Tue, 31 Mar 2009 02:59:20 +0300
Izik Eidus iei...@redhat.com wrote:

  

Ksm is a driver that allows merging identical pages between one or more
applications in a way invisible to the applications that use it.
Pages that are merged are marked as read-only and are COWed when any
application tries to change them.

Ksm is used for cases where using fork() is not suitable;
one such case is where the pages of the application keep changing
dynamically and the application cannot know in advance which pages are
going to be identical.

Ksm works by walking over the memory pages of the applications it
scans in order to find identical pages.
It uses two sorted data structures, called the stable and unstable trees,
to find the identical pages efficiently.

When ksm finds two identical pages, it marks them as read-only and merges
them into a single page.
After the pages are marked as read-only and merged into one page, Linux
will treat these pages as normal copy-on-write pages and will fork them
when a write access happens to them.

Ksm scans only memory areas that were registered to be scanned by it.

Ksm api:

KSM_GET_API_VERSION:
Give the userspace the api version of the module.

KSM_CREATE_SHARED_MEMORY_AREA:
Create a shared memory region fd, which later allows the user to register
the memory region to scan by using:
KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION

KSM_START_STOP_KTHREAD:
Return information about the kernel thread; the information is returned
using the ksm_kthread_info structure:
ksm_kthread_info:
__u32 sleep:
number of microseconds to sleep between each iteration of
scanning.

__u32 pages_to_scan:
number of pages to scan for each iteration of scanning.

__u32 max_pages_to_merge:
maximum number of pages to merge in each iteration of scanning
(so even if there are still more pages to scan, we stop this
iteration)

__u32 flags:
   flags to control ksmd (right now just ksm_control_flags_run
  available)

KSM_REGISTER_MEMORY_REGION:
Register userspace virtual address range to be scanned by ksm.
This ioctl is using the ksm_memory_region structure:
ksm_memory_region:
__u32 npages;
 number of pages to share inside this memory region.
__u32 pad;
__u64 addr:
the begining of the virtual address of this region.

KSM_REMOVE_MEMORY_REGION:
Remove memory region from ksm.

Signed-off-by: Izik Eidus iei...@redhat.com
---
 include/linux/ksm.h|   69 +++
 include/linux/miscdevice.h |1 +
 mm/Kconfig |6 +
 mm/Makefile|1 +
 mm/ksm.c   | 1431 
 5 files changed, 1508 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/ksm.h
 create mode 100644 mm/ksm.c

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
new file mode 100644
index 000..5776dce
--- /dev/null
+++ b/include/linux/ksm.h
@@ -0,0 +1,69 @@
+#ifndef __LINUX_KSM_H
+#define __LINUX_KSM_H
+
+/*
+ * Userspace interface for /dev/ksm - kvm shared memory
+ */
+
+#include linux/types.h
+#include linux/ioctl.h
+
+#include asm/types.h
+
+#define KSM_API_VERSION 1
+
+#define ksm_control_flags_run 1
+
+/* for KSM_REGISTER_MEMORY_REGION */
+struct ksm_memory_region {
+   __u32 npages; /* number of pages to share */
+   __u32 pad;
+   __u64 addr; /* the begining of the virtual address */
+__u64 reserved_bits;
+};
+
+struct ksm_kthread_info {
+   __u32 sleep; /* number of microsecoends to sleep */
+   __u32 pages_to_scan; /* number of pages to scan */
+   __u32 flags; /* control flags */
+__u32 pad;
+__u64 reserved_bits;
+};
+
+#define KSMIO 0xAB
+
+/* ioctls for /dev/ksm */
+
+#define KSM_GET_API_VERSION  _IO(KSMIO,   0x00)
+/*
+ * KSM_CREATE_SHARED_MEMORY_AREA - create the shared memory reagion fd
+ */
+#define KSM_CREATE_SHARED_MEMORY_AREA_IO(KSMIO,   0x01) /* return SMA fd */
+/*
+ * KSM_START_STOP_KTHREAD - control the kernel thread scanning speed
+ * (can stop the kernel thread from working by setting running = 0)
+ */
+#define KSM_START_STOP_KTHREAD  _IOW(KSMIO,  0x02,\
+ struct ksm_kthread_info)
+/*
+ * KSM_GET_INFO_KTHREAD - return information about the kernel thread
+ * scanning speed.
+ */
+#define KSM_GET_INFO_KTHREAD_IOW(KSMIO,  0x03,\
+ struct ksm_kthread_info)
+
+
+/* ioctls for SMA fds */
+
+/*
+ * KSM_REGISTER_MEMORY_REGION - register virtual address memory area to be
+ * scanned by kvm.
+ */
+#define KSM_REGISTER_MEMORY_REGION   _IOW(KSMIO,  0x20,\
+ struct ksm_memory_region)
+/*
+ * KSM_REMOVE_MEMORY_REGION - remove virtual address memory area from ksm.
+ */
+#define KSM_REMOVE_MEMORY_REGION _IO(KSMIO,   0x21)
+
+#endif
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 

Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Izik Eidus

Anthony Liguori wrote:

Izik Eidus wrote:

Ksm is a driver that allows merging identical pages between one or more
applications in a way invisible to the applications that use it.
Pages that are merged are marked as read-only and are COWed when any
application tries to change them.

Ksm is used for cases where using fork() is not suitable;
one such case is where the pages of the application keep changing
dynamically and the application cannot know in advance which pages are
going to be identical.

Ksm works by walking over the memory pages of the applications it
scans in order to find identical pages.
It uses two sorted data structures, called the stable and unstable trees,
to find the identical pages efficiently.

When ksm finds two identical pages, it marks them as read-only and merges
them into a single page.
After the pages are marked as read-only and merged into one page, Linux
will treat these pages as normal copy-on-write pages and will fork them
when a write access happens to them.

Ksm scans only memory areas that were registered to be scanned by it.

Ksm api:

KSM_GET_API_VERSION:
Give the userspace the api version of the module.

KSM_CREATE_SHARED_MEMORY_AREA:
Create shared memory reagion fd, that latter allow the user to register
the memory region to scan by using:
KSM_REGISTER_MEMORY_REGION and KSM_REMOVE_MEMORY_REGION

KSM_START_STOP_KTHREAD:
Return information about the kernel thread, the information is returned
using the ksm_kthread_info structure:
ksm_kthread_info:
__u32 sleep:
number of microseconds to sleep between each iteration of
scanning.

__u32 pages_to_scan:
number of pages to scan for each iteration of scanning.

__u32 max_pages_to_merge:
maximum number of pages to merge in each iteration of scanning
(so even if there are still more pages to scan, we stop this
iteration)

__u32 flags:
   flags to control ksmd (right now just ksm_control_flags_run
  available)
  


Wouldn't this make more sense as a sysfs interface?


I believe using ioctl for registering memory of applications makes it 
easier
Ksm doesn't have any complicated API that would benefit from sysfs 
(besides adding more complexity)


That is, the KSM_START_STOP_KTHREAD part, not necessarily the rest of 
the API.


What you mean?


Regards,

Anthony Liguori



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: new wiki missing pages? / new wiki for kvm

2009-03-31 Thread sudhir kumar
hugetlbfs info missing on new wiki. the info is here
http://il.qumranet.com/kvmwiki/UsingLargePages

On Wed, Mar 11, 2009 at 4:38 AM, Dor Laor dl...@redhat.com wrote:
 Hollis Blanchard wrote:

 On Tue, 2009-03-10 at 22:49 +0200, Dor Laor wrote:


 Sorry for that. It took IT only few month to change the Wiki... During
 this tight schedule
 some pages got lost as you can see.. Please report on a
 problematic/missing page.


 Are these emails sufficient, or are you asking us to report some other
 way?


 It is sufficient, I meant that all of the content writers should double
 check. Thanks.



 The original content can be reached using http://il.qumranet.com/kvmwiki


 Please restore all pages linked from here:
 http://il.qumranet.com/kvmwiki/CategoryPowerPC


 Sure



 In general, finally the kvm wiki just moved from qumranet.kvm.com to
 www.linux-kvm.org.


 It's very confusing that linux-kvm.com and linux-kvm.org are apparently
 completely unrelated. I wonder why you chose to create .org when .com
 already existed.


 You're right, I didn't pick this. Also we need to get rid of the old usage
 for kvm acronyms :)
 If we move to qemu wiki the problem will vanish.



 We're considering an option to unite the kvm and qemu wikis together
 since there is a lot
 of shared content and eventually we'll have a shared userspace
 executable.


 That would be great! First we'll need a working qemu wiki though...
 maybe you can solve that problem at the same time.

 What timeframe are we talking about? Next week? 6 months? Just a
 brainstorm?


 One of the qemu maintainers handles it.
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html




-- 
Sudhir Kumar
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: new wiki missing pages? / new wiki for kvm

2009-03-31 Thread Avi Kivity

sudhir kumar wrote:

hugetlbfs info missing on new wiki. the info is here
http://il.qumranet.com/kvmwiki/UsingLargePages

  


Adding smintz.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/4] ksm - dynamic page sharing driver for linux

2009-03-31 Thread Izik Eidus

Anthony Liguori wrote:

Izik Eidus wrote:

I am sending another series of patches for kvm kernel and kvm-userspace
that would allow users of kvm to test ksm with it.
The kvm patches would apply to Avi's git tree.
  
Any reason to not take these through upstream QEMU instead of 
kvm-userspace?  In principle, I don't see anything that would prevent 
normal QEMU from almost making use of this functionality.  That would 
make it one less thing to eventually have to merge...


The changes for the kvm-userspace were just provided for testing it...
After we will have ksm inside the kernel we will send another patch to 
qemu-devel that will add support for it.




Regards,

Anthony Liguori


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Segfault while booting Windows XP x64

2009-03-31 Thread Mike Kelly
On Tue, 31 Mar 2009 08:54:48 +0300
Gleb Natapov g...@redhat.com wrote:

 On Mon, Mar 30, 2009 at 11:26:52PM -0400, Mike Kelly wrote:
  I'm on a Intel(R) Core(TM)2 Duo CPU T7500 @ 2.20GHz, using a 2.6.29
  vanilla kernel, x86_64. kvm userland version 84.
  
  When I try to boot my x64 Windows XP, it gets partway through the
  windows booting process, with the progress bar and what not. Then, I
  get the attached backtrace.
  
  The various -no-kvm options don't seem to make a difference.
  
  I created, and was able to boot, this image using linux 2.6.28. I'll
  give it a shot again later to confirm that is still the case.
  
 Are you sure you have write permission to that image?

Hmm, I thought I did, but looks like I messed up my mount this time
around. Dang.

That still shouldn't cause a segfault, though. But, yes, fixing my
mount fixes the crash.

-- 
Mike Kelly
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Segfault while booting Windows XP x64

2009-03-31 Thread Gleb Natapov
On Tue, Mar 31, 2009 at 08:50:25AM -0400, Mike Kelly wrote:
 On Tue, 31 Mar 2009 08:54:48 +0300
 Gleb Natapov g...@redhat.com wrote:
 
  On Mon, Mar 30, 2009 at 11:26:52PM -0400, Mike Kelly wrote:
   I'm on a Intel(R) Core(TM)2 Duo CPU T7500 @ 2.20GHz, using a 2.6.29
   vanilla kernel, x86_64. kvm userland version 84.
   
   When I try to boot my x64 Windows XP, it gets partway through the
   windows booting process, with the progress bar and what not. Then, I
   get the attached backtrace.
   
   The various -no-kvm options don't seem to make a difference.
   
   I created, and was able to boot, this image using linux 2.6.28. I'll
   give it a shot again later to confirm that is still the case.
   
  Are you sure you have write permission to that image?
 
 Hmm, I thought I did, but looks like I messed up my mount this time
 around. Dang.
 
 That still shouldn't cause a segfault, though. But, yes, fixing my
 mount fixes the crash.
 
This crash is known and a fix is being worked on. It happens on the IO
cancellation path and usually you get there if you don't have write
permission to your image.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Segfault while booting Windows XP x64

2009-03-31 Thread Mike Kelly
On Tue, 31 Mar 2009 15:53:06 +0300
Gleb Natapov g...@redhat.com wrote:

 This crash is known and a fix is being worked on. It happens on the IO
 cancellation path and usually you get there if you don't have write
 permission to your image.

Ok, cool. Thanks for the help w/ my stupidity.

-- 
Mike Kelly
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Anthony Liguori

Izik Eidus wrote:


I believe using ioctl for registering memory of applications makes it 
easier


Yes, I completely agree.

Ksm doesn't have any complicated API that would benefit from sysfs 
(besides adding more complexity)


That is, the KSM_START_STOP_KTHREAD part, not necessarily the rest of 
the API.


What you mean?


The ioctl(KSM_START_STOP_KTHREAD) API is distinct from the rest of the 
API.  Whereas the rest of the API is used by applications to register 
their memory with KSM, this API is used by ksmctl to allow parameters to 
be tweaked in userspace.


These parameters are just simple values like enable, pages_to_scan, 
sleep_time.  Then there is KSM_GET_INFO_KTHREAD which provides a read 
interface to these parameters.


You could drop KSM_START_STOP_KTHREAD and KSM_GET_INFO_KTHREAD 
altogether, and introduce a sysfs hierarchy:


/sysfs/some/path/ksm/{enable,pages_to_scan,sleep_time}

That eliminates the need for ksmctl altogether, cleanly separates the 
two APIs, and provides a stronger interface.


The main problem with the current API is that it uses a single device to 
do both the administrative task and the userspace interface.  That means 
that any application that has access to registering its memory with KSM 
also has the ability to disable KSM.  That seems like a security concern 
to me since registering a memory region ought to be an unprivileged 
action whereas enabling/disabling KSM ought to be a privileged action.


Regards,

Anthony Liguori



Regards,

Anthony Liguori





--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[ kvm-Bugs-2723366 ] ltp diotest running time is 2.54 times than before

2009-03-31 Thread SourceForge.net
Bugs item #2723366, was opened at 2009-03-31 07:00
Message generated for change (Tracker Item Submitted) made by jiajun
You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2723366&group_id=180599

Please note that this message will contain a full copy of the comment thread,
including the initial issue submission, for this request,
not just the latest update.
Category: qemu
Group: None
Status: Open
Resolution: None
Priority: 5
Private: No
Submitted By: Jiajun Xu (jiajun)
Assigned to: Nobody/Anonymous (nobody)
Summary: ltp diotest running time is 2.54 times than before

Initial Comment:
Running LTP diotest in guest costs about 2m22s in latest commit which it needs 
0m56s in older commit.

New kvm kernel commit:a317a1e496b22d1520218ecf16a02498b99645e2 
kvm user-space commit: df0e52a8d988d55dd42f8d46faffa9faa41892c9
ltp diotest running time: 5 rounds of diotest average time: 2m22s

Old kvm kernel commit:e74bb3fa8e55284dc6fdd68aa9da833ce07a4295
kvm user-space commit: 4c1083fd610fba4eedb45553c0a579b7b6593f1a
ltp diotest running time: 5 rounds of diotest average time: 0m56s

Reproduce steps:
(1)qemu-system-x86_64  -m 512 -smp 4  -net 
nic,macaddr=00:16:3e:12:0d:3c,model=rtl8139 -net tap,script=/etc/kvm/qemu-ifup 
-hda /share/xvs/var/ia32e_rhel4u1.img
(2)Get LTP package from sourceforge and install it in guest
(3)Run diotest for 5 times.
while [ $i -lt 5 ]
do
time /ltp-full-20070930/runltp -l /tmp/ltp_dio.log -f dio -p -q 
/root/ltp.log
((i++))
done

Notes:
The data is running with ltp-full-20070930, and we also tried latest ltp 
package ltp-full-20090228, which can get similar data.


--

You can respond by visiting: 
https://sourceforge.net/tracker/?func=detail&atid=893831&aid=2723366&group_id=180599
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Andrea Arcangeli
On Tue, Mar 31, 2009 at 08:31:31AM -0500, Anthony Liguori wrote:
 You could drop KSM_START_STOP_KTHREAD and KSM_GET_INFO_KTHREAD altogether, 
 and introduce a sysfs hierarchy:

 /sysfs/some/path/ksm/{enable,pages_to_scan,sleep_time}

Introducing a sysfs hierarchy sounds a bit of overkill.

 the ability to disable KSM.  That seems like a security concern to me since 
 registering a memory region ought to be an unprivileged action whereas 
 enabling/disabling KSM ought to be a privileged action.

sysfs files would then only be writeable by admin, so if we want to
allow only admin to start/stop/tune ksm it'd be enough to plug an
admin capability check in the ioctl to provide equivalent permissions.

I could imagine converting the enable/pages_to_scan/sleep_time to
module params and tweaking them through /sys/module/ksm/parameters,
but for enable to work that way, we'd need to intercept the write so
we can at least wake up the kksmd daemon, which doesn't seem possible
with /sys/module/ksm/parameters, so in the end if we stick to the
ioctl for registering regions, it seems simpler to use it for
start/stop/tune too.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: new wiki missing pages? / new wiki for kvm

2009-03-31 Thread Shahar Mintz
Added:

http://www.linux-kvm.org/page/UsingLargePages

Many thanks!

Shahar Mintz smi...@redhat.com
 
IT and Infrastructure
Red Hat Israel, Ra'anana
 
Phone: +972 9 7754666
Extension: 5106
IRC: smintz
GnuPG: FFEC 6A38 420D 288A 0D16  EEE0 4D5D 287A 5686 23CC



Avi Kivity wrote:
 sudhir kumar wrote:
 hugetlbfs info missing on new wiki. the info is here
 http://il.qumranet.com/kvmwiki/UsingLargePages

   

 Adding smintz.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Anthony Liguori

Andrea Arcangeli wrote:

the ability to disable KSM.  That seems like a security concern to me since 
registering a memory region ought to be an unprivileged action whereas 
enabling/disabling KSM ought to be a privileged action.



sysfs files would then only be writeable by admin, so if we want to
allow only admin to start/stop/tune ksm it'd be enough to plug an
admin capability check in the ioctl to provide equivalent permissions.
  


Caps are not very granular unless you introduce a new capability.  
Furthermore, it's a bit more difficult to associate a capability with a 
user/group.


With sysfs, you use file based permissions to control the API.  It also 
fits into things like selinux a lot better.


In the very least, if you insist on not using sysfs, you should have a 
separate character device that's used for control (like /dev/ksmctl).


Regards,

Anthony Liguori


I could imagine converting the enable/pages_to_scan/sleep_time to
module params and tweaking them through /sys/module/ksm/parameters,
but for enable to work that way, we'd need to intercept the write so
we can at least wake up the kksmd daemon, which doesn't seem possible
with /sys/module/ksm/parameters, so in the end if we stick to the
ioctl for registering regions, it seems simpler to use it for
start/stop/tune too.
  


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno
Today we upgraded one of our VM's from F9 to F10 and after the first 
reboot we see the dreaded GRUB prompt.  This it turns out is a known 
problem with F10 installs.  And the recovery is usually very simple.  
You boot into rescue mode from CDROM and reinstall the boot loader.  The 
problem we're seeing is that even though I select CDROM from the boot 
menu, it will never boot from the CDROM.  It always has an error.  What 
can we do to get this VM to boot from the CDROM drive so that we can 
install a  new bootloader and recover this VM?


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Tomasz Chmielewski

Gerry Reno schrieb:
Today we upgraded one of our VM's from F9 to F10 and after the first 
reboot we see the dreaded GRUB prompt.  This it turns out is a known 
problem with F10 installs.  And the recovery is usually very simple.  
You boot into rescue mode from CDROM and reinstall the boot loader.  The 
problem we're seeing is that even though I select CDROM from the boot 
menu, it will never boot from the CDROM.  It always has an error.


What error?


What 
can we do to get this VM to boot from the CDROM drive so that we can 
install a  new bootloader and recover this VM?


What parameters do you use to start the guest?


--
Tomasz Chmielewski
http://wpkg.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


EPT support breakage on: KVM: VMX: Zero ept module parameter if ept is not present

2009-03-31 Thread Andrew Theurer

I cannot get EPT support to work on commit:
21f65ab2c582594a69dcb1484afa9f88b3414b4f
KVM: VMX: Zero ept module parameter if ept is not present

I see tons of pf_guest from kvm_stat, where as the previous commit has none.
I am using ept=1 module option for kvm-intel.

This is on Nehalem processors.

-Andrew


commit diff:

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8b1b9b8..96a19f8 100644 (file)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -265,7 +265,7 @@ static inline int cpu_has_vmx_ept(void)

static inline int vm_need_ept(void)
{
-   return (cpu_has_vmx_ept() && enable_ept);
+   return enable_ept;
}

static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -1205,6 +1205,9 @@ static __init int setup_vmcs_config(struct 
vmcs_config *vmcs_conf)

   if (!cpu_has_vmx_vpid())
   enable_vpid = 0;

+   if (!cpu_has_vmx_ept())
+   enable_ept = 0;
+
   min = 0;
#ifdef CONFIG_X86_64
   min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Andrea Arcangeli
On Tue, Mar 31, 2009 at 09:37:17AM -0500, Anthony Liguori wrote:
 In the very least, if you insist on not using sysfs, you should have a 
 separate character device that's used for control (like /dev/ksmctl).

I'm fine to use sysfs that's not the point, if you've to add a ksmctl
device, then sysfs is surely better. Besides ksm would normally be
enabled at boot, tasks jailed by selinux will better not start/stop
this thing.

If people wants /sys/kernel/mm/ksm instead of the start_stop ioctl we
surely can add it (provided there's a way to intercept write to the
sysfs file). Problem is registering memory could also be done with
'echo 0 -1 >/proc/self/ksm' and be inherited by children, it's not just
start/stop. I mean this is more a matter of taste I'm
afraid... Personally I'm more concerned about the registering of the
ram API than the start/stop thing which I cannot care less about, so
my logic is that as long as this pseudodevice exists, we should use it
for everything. If we go away from it, then we should remove it as a
whole.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: Use rsvd_bits_mask in load_pdptrs for cleanup and considing EXB bit

2009-03-31 Thread Dong, Eddie

 
 Looks good, but doesn't apply; please check if you are working against
 the latest version.

Rebased on top of a317a1e496b22d1520218ecf16a02498b99645e2 + previous rsvd bits 
violation check patch.

thx, eddie



Use rsvd_bits_mask in load_pdptrs and remove bit 5-6 from rsvd_bits_mask 
per latest SDM.

Signed-off-by: Eddie Dong eddie.d...@intel.com


diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 41a0482..400c056 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -225,11 +225,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
return vcpu->arch.shadow_efer & EFER_NX;
 }
 
-static int is_present_pte(unsigned long pte)
-{
-   return pte & PT_PRESENT_MASK;
-}
-
 static int is_shadow_present_pte(u64 pte)
 {
return pte != shadow_trap_nonpresent_pte
@@ -2195,6 +2190,9 @@ void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int 
level)
context->rsvd_bits_mask[1][0] = 0;
break;
case PT32E_ROOT_LEVEL:
+   context->rsvd_bits_mask[0][2] =
+   rsvd_bits(maxphyaddr, 63) |
+   rsvd_bits(7, 8) | rsvd_bits(1, 2);  /* PDPTE */
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62);  /* PDE */
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab214..3494a2f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
+static inline int is_present_pte(unsigned long pte)
+{
+   return pte & PT_PRESENT_MASK;
+}
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9702353..3d07c9a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -234,7 +234,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-   if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+   if (is_present_pte(pdpte[i]) &&
+   (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Tomasz Chmielewski wrote:

Gerry Reno schrieb:
Today we upgraded one of our VM's from F9 to F10 and after the first 
reboot we see the dreaded GRUB prompt.  This it turns out is a known 
problem with F10 installs.  And the recovery is usually very simple.  
You boot into rescue mode from CDROM and reinstall the boot loader.  
The problem we're seeing is that even though I select CDROM from the 
boot menu, it will never boot from the CDROM.  It always has an error.


What error?


Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.

The host has only a DVD drive and we are using the DVD F10 install disk.



What can we do to get this VM to boot from the CDROM drive so that we 
can install a  new bootloader and recover this VM?


What parameters do you use to start the guest?



I'm using the GUI VMM and selecting Run on that VM.


Regards,
Gerry




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Anthony Liguori

Andrea Arcangeli wrote:

On Tue, Mar 31, 2009 at 09:37:17AM -0500, Anthony Liguori wrote:
  
In the very least, if you insist on not using sysfs, you should have a 
separate character device that's used for control (like /dev/ksmctl).



I'm fine to use sysfs that's not the point, if you've to add a ksmctl
device, then sysfs is surely better. Besides ksm would normally be
enabled at boot, tasks jailed by selinux will better not start/stop
this thing.

If people wants /sys/kernel/mm/ksm instead of the start_stop ioctl we
surely can add it (provided there's a way to intercept write to the
sysfs file). Problem is registering memory could also be done with
'echo 0 -1 >/proc/self/ksm' and be inherited by children, it's not just
start/stop. I mean this is more a matter of taste I'm
afraid... Personally I'm more concerned about the registering of the
ram API than the start/stop thing which I cannot care less about,


I don't think the registering of ram should be done via sysfs.  That 
would be a pretty bad interface IMHO.  But I do think the functionality 
that ksmctl provides along with the security issues I mentioned earlier 
really suggest that there ought to be a separate API for control vs. 
registration and that control API would make a lot of sense as a sysfs API.


If you wanted to explore alternative APIs for registration, madvise() 
seems like the obvious candidate to me.


madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.

So combining a sysfs interface for control and an madvise() interface 
for registration seems like a really nice interface to me.


Regards,

Anthony Liguori


 so
my logic is that as long as this pseudodevice exists, we should use it
for everything. If we go away from it, then we should remove it as a
whole.
  


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Tomasz Chmielewski

Gerry Reno schrieb:

Tomasz Chmielewski wrote:

Gerry Reno schrieb:
Today we upgraded one of our VM's from F9 to F10 and after the first 
reboot we see the dreaded GRUB prompt.  This it turns out is a known 
problem with F10 installs.  And the recovery is usually very simple.  
You boot into rescue mode from CDROM and reinstall the boot loader.  
The problem we're seeing is that even though I select CDROM from the 
boot menu, it will never boot from the CDROM.  It always has an error.


What error?


Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.

The host has only a DVD drive and we are using the DVD F10 install disk.



What can we do to get this VM to boot from the CDROM drive so that we 
can install a  new bootloader and recover this VM?


What parameters do you use to start the guest?



I'm using the GUI VMM and selecting Run on that VM.


What is GUI VMM?
Do you know what parameters it passes to kvm binary?


--
Tomasz Chmielewski
http://wpkg.org

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IO on guest is 20 times slower than host

2009-03-31 Thread Kurt Yoder

 On Mar 29, 2009, at 10:29 AM, Avi Kivity wrote:

 Kurt Yoder wrote:

snip


 Can you try loading kvm_amd on this host with 'modprobe kvm-amd npt=0'?

 So that's most likely the problem for me:

 m...@host:/etc/nagios/nrpe_directives$ sudo modprobe kvm-amd npt=0
 FATAL: Error inserting kvm_amd (/lib/modules/2.6.27-11-server/kernel/
 arch/x86/kvm/kvm-amd.ko): Operation not supported
 m...@host:/etc/nagios/nrpe_directives$ uname -a
 Linux boron 2.6.27-11-server #1 SMP Thu Jan 29 20:13:12 UTC 2009
 x86_64 GNU/Linux


 It looks like I need to enable SVM in my BIOS. I'll do that and report
 back on the results.




The AMD virtualization option was disabled in my BIOS. Once I enabled it,
all my problems disappeared:

m...@guest:~$ sudo hdparm -t /dev/sda

/dev/sda:
 Timing buffered disk reads:  846 MB in  3.00 seconds = 281.73 MB/sec
m...@guest:~$ sudo dd if=/dev/zero of=/tmp/bigfile count=1000000
1000000+0 records in
1000000+0 records out
512000000 bytes (512 MB) copied, 3.84358 s, 133 MB/s


I do recall seeing a warning at boot about an error loading kvm-amd, but
it was not onscreen for long. It was something like "error loading module
kvm-amd"; looking back now, I see in the syslog "kvm: disabled by bios".
Perhaps a warning about slow IO should be issued/logged on AMD hosts every
time KVM is brought up without the benefit of the kvm-amd kernel module?
Anyway, I'm happy now.



Thanks Avi for your help.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Andrea Arcangeli
On Tue, Mar 31, 2009 at 10:09:24AM -0500, Anthony Liguori wrote:
 I don't think the registering of ram should be done via sysfs.  That would 
 be a pretty bad interface IMHO.  But I do think the functionality that 
 ksmctl provides along with the security issues I mentioned earlier really 
 suggest that there ought to be a separate API for control vs. registration 
 and that control API would make a lot of sense as a sysfs API.

 If you wanted to explore alternative APIs for registration, madvise() seems 
 like the obvious candidate to me.

 madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.

madvise to me would sound appropriate, only if ksm would be always-in,
which is not the case as it won't even be built if it's configured to
N.

Besides, madvise is a SUS-covered syscall, and this is a Linux-specific detail.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Tomasz Chmielewski wrote:

Gerry Reno schrieb:

Tomasz Chmielewski wrote:

Gerry Reno schrieb:
Today we upgraded one of our VM's from F9 to F10 and after the 
first reboot we see the dreaded GRUB prompt.  This it turns out is 
a known problem with F10 installs.  And the recovery is usually 
very simple.  You boot into rescue mode from CDROM and reinstall 
the boot loader.  The problem we're seeing is that even though I 
select CDROM from the boot menu, it will never boot from the 
CDROM.  It always has an error.


What error?


Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.

The host has only a DVD drive and we are using the DVD F10 install disk.



What can we do to get this VM to boot from the CDROM drive so that 
we can install a  new bootloader and recover this VM?


What parameters do you use to start the guest?



I'm using the GUI VMM and selecting Run on that VM.


What is GUI VMM?

virt-manager


Do you know what parameters it passes to kvm binary?



Here's the XML dump for that VM:
<domain type='kvm' id='8'>
  <name>MX_3</name>
  <uuid>5f478a4f-86e6-82b5-b53b-acff9d9f1f23</uuid>
  <memory>524288</memory>
  <currentMemory>524288</currentMemory>
  <vcpu>2</vcpu>
  <os>
    <type arch='i686' machine='pc'>hvm</type>
    <boot dev='hd'/>
  </os>
  <features>
    <acpi/>
  </features>
  <clock offset='utc'/>
  <on_poweroff>destroy</on_poweroff>
  <on_reboot>restart</on_reboot>
  <on_crash>destroy</on_crash>
  <devices>
    <emulator>/usr/bin/qemu-kvm</emulator>
    <disk type='file' device='disk'>
      <source file='/var/vm/vm1/qemu/images/MX_3/MX_3.img'/>
      <target dev='hda' bus='ide'/>
    </disk>
    <interface type='bridge'>
      <mac address='00:0c:29:e3:bc:ee'/>
      <source bridge='br0'/>
      <target dev='vnet1'/>
    </interface>
    <input type='mouse' bus='ps2'/>
    <graphics type='vnc' port='5901' autoport='yes' listen='127.0.0.1' keymap='en-us'/>

  </devices>
</domain>

Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 4/4] Fix task switching.

2009-03-31 Thread Kohl, Bernhard (NSN - DE/Munich)
Bernhard Kohl wrote:
 
 Jan Kiszka wrote:
  
  Gleb Natapov wrote:
   The patch fixes two problems with task switching.
   1. Back link is written to a wrong TSS.
   2. Instruction emulation is not needed if the reason for 
 task switch
  is a task gate in IDT and access to it is caused by an 
  external event.
   
   2 is currently solved only for VMX since there is not 
  reliable way to
   skip an instruction in SVM. We should emulate it instead.
  
  Does this series fix all issues Bernhard, Thomas and Julian 
  stumbled over?
  
  Jan
 
 I will try this today. Thanks.
 
Yes, it works for us (Thomas + Bernhard).

Bernhard
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] Fix task switching.

2009-03-31 Thread Gleb Natapov
On Tue, Mar 31, 2009 at 05:21:16PM +0200, Kohl, Bernhard (NSN - DE/Munich) 
wrote:
 Bernhard Kohl wrote:
  
  Jan Kiszka wrote:
   
   Gleb Natapov wrote:
The patch fixes two problems with task switching.
1. Back link is written to a wrong TSS.
2. Instruction emulation is not needed if the reason for 
  task switch
   is a task gate in IDT and access to it is caused by an 
   external even.

2 is currently solved only for VMX since there is not 
   reliable way to
skip an instruction in SVM. We should emulate it instead.
   
   Does this series fix all issues Bernhard, Thomas and Julian 
   stumbled over?
   
   Jan
  
  I will try this today. Thanks.
  
 Yes, it works for us (Thomas + Bernhard).
 
Great. Thanks for testing.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Charles Duffy

Gerry Reno wrote:

What is GUI VMM?

virt-manager


The libvirt mailing list is the right place to get support for 
virt-manager. To get support for kvm proper here, you'll want to use ps 
to see the command line with which libvirtd is invoking kvm and provide 
that.


That said, the XML you copied contains no CD-ROM device; you'll want to 
fix that before proceeding.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Anthony Liguori

Andrea Arcangeli wrote:

On Tue, Mar 31, 2009 at 10:09:24AM -0500, Anthony Liguori wrote:
  
I don't think the registering of ram should be done via sysfs.  That would 
be a pretty bad interface IMHO.  But I do think the functionality that 
ksmctl provides along with the security issues I mentioned earlier really 
suggest that there ought to be a separate API for control vs. registration 
and that control API would make a lot of sense as a sysfs API.


If you wanted to explore alternative APIs for registration, madvise() seems 
like the obvious candidate to me.


madvise(start, size, MADV_SHARABLE) seems like a pretty obvious API to me.



madvise to me would sound appropriate, only if ksm would be always-in,
which is not the case as it won't even be built if it's configured to
N.
  


You can still disable ksm and simply return ENOSYS for the MADV_ flag.  
You could even keep it as a module if you liked by separating the 
madvise bits from the ksm bits.  The madvise() bits could just provide 
the tracking infrastructure for determining which vmas were currently 
marked as sharable.


You could then have ksm as loadable module that consumed that interface 
to then perform scanning.



Besides, madvise is a SUS-covered syscall, and this is a Linux-specific detail.
  


A number of MADV_ flags are Linux specific (like MADV_DOFORK/MADV_DONTFORK).

Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Charles Duffy wrote:

Gerry Reno wrote:

What is GUI VMM?

virt-manager


To get support for kvm proper here, you'll want to use ps to see the 
command line with which libvirtd is invoking kvm and provide that.


/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot c 
-drive file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-net nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=18,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us


What should I add to this line to get cdrom working?


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Charles Duffy

Gerry Reno wrote:

Charles Duffy wrote:

Gerry Reno wrote:

What is GUI VMM?

virt-manager


To get support for kvm proper here, you'll want to use ps to see the 
command line with which libvirtd is invoking kvm and provide that.


/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot c 
-drive file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-net nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=18,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us


-drive file=/path/to/your.iso,if=ide,index=1,media=cdrom

or, in your XML:

<disk type='file' device='cdrom'>
  <source file='/path/to/your.iso'/>
  <target dev='hdd' bus='ide'/>
  <readonly/>
</disk>

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mm_pages_next() question

2009-03-31 Thread Marcelo Tosatti
On Sun, Mar 29, 2009 at 03:24:08PM +0300, Avi Kivity wrote:
 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
                           struct mmu_page_path *parents,
                           int i)
 {
         int n;

         for (n = i+1; n < pvec->nr; n++) {
                 struct kvm_mmu_page *sp = pvec->page[n].sp;

                 if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
                         parents->idx[0] = pvec->page[n].idx;
                         return n;
                 }

                 parents->parent[sp->role.level-2] = sp;
                 parents->idx[sp->role.level-1] = pvec->page[n].idx;
         }

         return n;
 }

 Do we need to break out of the loop if we switch parents during the loop  
 (since that will give us a different mmu_page_path)?  Or are callers  
 careful to only pass pvecs which belong to the same shadow page?

This function builds mmu_page_path for a number of pagetable (leaf)
pages. Whenever the path changes, mmu_page_path will be rebuilt.

The pages in the pvec must be organized as follows:

level4, level3, level2, level1, level1, level1, , level3, level2,
level1, level1, ...

So you don't have to repeat higher levels for a number of leaf pages.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Andrea Arcangeli
On Tue, Mar 31, 2009 at 10:54:57AM -0500, Anthony Liguori wrote:
 You can still disable ksm and simply return ENOSYS for the MADV_ flag.  You 

-EINVAL if something, -ENOSYS would tell userland that it shall stop
trying to use madvise, including the other MADV_ too.

 could even keep it as a module if you liked by separating the madvise bits 
 from the ksm bits.  The madvise() bits could just provide the tracking 
 infrastructure for determine which vmas were currently marked as sharable.
 You could then have ksm as loadable module that consumed that interface to 
 then perform scanning.

What's the point of making ksm a module if one has part of ksm code
loaded in the kernel and not being possible to avoid compiling in?
People that says KSM=N in their .config (like embedded running with 1M
of ram), don't want that tracking overhead compiled into the kernel.

Returning -EINVAL would be an option but again I think madvise is core
syscall for SuS and I don't like that those core VM parts returns
-EINVAL at will depend on certain kernel modules being loaded.

 A number of MADV_ flags are Linux specific (like 
 MADV_DOFORK/MADV_DONTFORK).

But those aren't kernel module related, so they're in line with the
standard ones and could be adapted by other OS.

KSM is not a core VM functionality, madvise is a core VM
functionality, so I don't see fit. KSM as ioctl or KSM creating
/proc/pid/ksm when loaded, sounds fine to me instead. If open of
either one fails, application won't register in. It's up to you to
choose KSM=M/N, if you want it as core functionality just build as
KSM=Y but leave the option to others to save memory.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Anthony Liguori

Andrea Arcangeli wrote:

On Tue, Mar 31, 2009 at 10:54:57AM -0500, Anthony Liguori wrote:
  
You can still disable ksm and simply return ENOSYS for the MADV_ flag.  You 



-EINVAL if something, -ENOSYS would tell userland that it shall stop
trying to use madvise, including the other MADV_ too.

  
could even keep it as a module if you liked by separating the madvise bits 
from the ksm bits.  The madvise() bits could just provide the tracking 
infrastructure for determine which vmas were currently marked as sharable.
You could then have ksm as loadable module that consumed that interface to 
then perform scanning.



What's the point of making ksm a module if one has part of ksm code
loaded in the kernel and not being possible to avoid compiling in?
People that says KSM=N in their .config (like embedded running with 1M
of ram), don't want that tracking overhead compiled into the kernel.
  


You have two things here.  CONFIG_MEM_SHARABLE and CONFIG_KSM.  
CONFIG_MEM_SHARABLE cannot be a module. If it's set to =n, then 
madvise(MADV_SHARABLE) == -ENOSYS.


If CONFIG_MEM_SHARABLE=y, then madvise(MADV_SHARABLE) will keep track of 
all sharable memory regions.  Independently of that, CONFIG_KSM can be 
set to n,m,y.  It depends on CONFIG_MEM_SHARABLE and when it's loaded, 
it consumes the list of sharable vmas.


But honestly, CONFIG_MEM_SHARABLE shouldn't be a lot of code so I don't see 
why you'd even need to make it configurable.


A number of MADV_ flags are Linux specific (like 
MADV_DOFORK/MADV_DONTFORK).



But those aren't kernel module related, so they're in line with the
standard ones and could be adapted by other OS.

KSM is not a core VM functionality, madvise is a core VM
functionality, so I don't see fit. KSM as ioctl or KSM creating
/proc/pid/ksm when loaded, sounds fine to me instead. If open of
either one fails, application won't register in. It's up to you to
choose KSM=M/N, if you want it as core functionality just build as
KSM=Y but leave the option to others to save memory.
  


The ioctl() interface is quite bad for what you're doing.  You're 
telling the kernel extra information about a VA range in userspace.  
That's what madvise is for.  You're tweaking simple read/write values of 
kernel infrastructure.  That's what sysfs is for.


Regards,

Anthony Liguori
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Charles Duffy wrote:

Gerry Reno wrote:

Charles Duffy wrote:

Gerry Reno wrote:

What is GUI VMM?

virt-manager


To get support for kvm proper here, you'll want to use ps to see the 
command line with which libvirtd is invoking kvm and provide that.


/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty 
-boot c -drive 
file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-net nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=18,script=,vlan=0,ifname=vnet1 -serial none -parallel none 
-usb -vnc 127.0.0.1:1 -k en-us


-drive file=/path/to/your.iso,if=ide,index=1,media=cdrom

or, in your XML:

<disk type='file' device='cdrom'>
  <source file='/path/to/your.iso'/>
  <target dev='hdd' bus='ide'/>
  <readonly/>
</disk>

I put the xml stanza in the file and undefine/define domain but it gives 
an error about cannot read image file.

source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command line 
on the DVD just fine.

What could be the problem?

Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Javier Guerra
On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net wrote:
 Charles Duffy wrote:
 I put the xml stanza in the file and undefine/define domain but it gives an
 error about cannot read image file.
 source file=/media/Fedora 10 DVD/
 And I check this path and I can read all the files from the command line on
 the DVD just fine.
 What could be the problem?

don't put a mount dir, either use a ISO image, or the cdrom device file


-- 
Javier
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Charles Duffy

Gerry Reno wrote:

source file=/media/Fedora 10 DVD/


Well, you're missing a set of opening quotation marks here... and your 
file doesn't have a .iso extension? That's rather unusual.


Anyhow, inasmuch as you're unable to run a define successfully, this is 
a libvirt usage problem, not a kvm issue; I suggest you try irc.oftc.net 
#virt or gmane.comp.emulators.libvirt for support.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Tomasz Chmielewski

Gerry Reno schrieb:


<disk type='file' device='cdrom'>
  <source file='/path/to/your.iso'/>
  <target dev='hdd' bus='ide'/>
  <readonly/>
</disk>

I put the xml stanza in the file and undefine/define domain but it gives 
an error about cannot read image file.

source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command line 
on the DVD just fine.

What could be the problem?


/some/where/fedora.iso

_not_ a mounted directory!

--
Tomasz Chmielewski
http://wpkg.org
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] add ksm kernel shared memory driver.

2009-03-31 Thread Andrea Arcangeli
On Tue, Mar 31, 2009 at 11:51:14AM -0500, Anthony Liguori wrote:
 You have two things here.  CONFIG_MEM_SHARABLE and CONFIG_KSM.  
 CONFIG_MEM_SHARABLE cannot be a module. If it's set to =n, then 
 madvise(MADV_SHARABLE) == -ENOSYS.

Which part of "-ENOSYS tells userland that the madvise syscall table is
empty, which is obviously not the case" wasn't clear?

 If CONFIG_MEM_SHARABLE=y, then madvise(MADV_SHARABLE) will keep track of 
 all sharable memory regions.  Independently of that, CONFIG_KSM can be set 
 to n,m,y.  It depends on CONFIG_MEM_SHARABLE and when it's loaded, it 
 consumes the list of sharable vmas.

And what do you gain by creating two config params when only one is
needed other than more pain for the poor user doing make oldconfig and
being asked a zillion new questions that aren't necessary?

 But honestly, CONFIG_MEM_SHARABLE shouldn't a lot of code so I don't see 
 why you'd even need to make it configable.

Even if you were to move the registration code in madvise with a
-EINVAL retval if KSM was set to N for embedded, CONFIG_KSM would be
enough: the registration code would be surrounded by CONFIG_KSM_MODULE
|| CONFIG_KSM, just like page_wrprotect/replace_page. This
CONFIG_MEM_SHARABLE in addition to CONFIG_KSM is beyond what can make
sense to me.

 The ioctl() interface is quite bad for what you're doing.  You're telling 
 the kernel extra information about a VA range in userspace.  That's what 

The ioctl can be extended to also tell which pid to share without
having to specify VA range, and having the feature inherited by the
child. Not everyone wants to deal with VA.

But my main issue with madvise is that it's core kernel functionality
while KSM clearly is not.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Javier Guerra wrote:

On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net wrote:
  

Charles Duffy wrote:
I put the xml stanza in the file and undefine/define domain but it gives an
error about cannot read image file.
source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command line on
the DVD just fine.
What could be the problem?



don't put a mount dir, either use a ISO image, or the cdrom device file


  
Ok, a little closer now.  I put this in xml file and redefine domain and 
it now defines:


source file=/dev/sr0/
This was device that mount showed as mounting the DVD.

But when the domain boots and I select 3. CDROM from screen, it still shows the 
original boot error:
Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device. 


What should I change?

Regards,
Gerry



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Gerry Reno wrote:

Javier Guerra wrote:

On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net wrote:
 

Charles Duffy wrote:
I put the xml stanza in the file and undefine/define domain but it 
gives an

error about cannot read image file.
source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command 
line on

the DVD just fine.
What could be the problem?



don't put a mount dir, either use a ISO image, or the cdrom device file


  
Ok, a little closer now.  I put this in xml file and redefine domain 
and it now defines:


source file=/dev/sr0/
This was device that mount showed as mounting the DVD.

But when the domain boots and I select 3. CDROM from screen, it still 
shows the original boot error:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.
What should I change?


Here is what command looks like now using 'ps':
/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot c 
-drive file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-drive file=/dev/sr0,if=ide,media=cdrom,index=3 -net 
nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Can't download kvmctl scripts

2009-03-31 Thread Brent A Nelson

URL: http://www.linux-kvm.org/page/HowToConfigScript

The kvmctl scripts in the HowTo pages can't be downloaded, as the download 
links are actually uploads.


Thanks,

Brent Nelson
Director of Computing
Dept. of Physics
University of Florida
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Gerry Reno wrote:

Gerry Reno wrote:

Javier Guerra wrote:

On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net wrote:
 

Charles Duffy wrote:
I put the xml stanza in the file and undefine/define domain but it 
gives an

error about cannot read image file.
source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command 
line on

the DVD just fine.
What could be the problem?



don't put a mount dir, either use a ISO image, or the cdrom device file


  
Ok, a little closer now.  I put this in xml file and redefine domain 
and it now defines:


source file=/dev/sr0/
This was device that mount showed as mounting the DVD.

But when the domain boots and I select 3. CDROM from screen, it still 
shows the original boot error:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.
What should I change?


Here is what command looks like now using 'ps':
/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot 
c -drive 
file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-drive file=/dev/sr0,if=ide,media=cdrom,index=3 -net 
nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us



And I try other disk type:
disk type='block' device='cdrom'
But that produces the same error.

What else can I add in order to boot from cdrom?


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Tomasz Chmielewski

Gerry Reno schrieb:

Gerry Reno wrote:

Gerry Reno wrote:

Javier Guerra wrote:

On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net wrote:
 

Charles Duffy wrote:
I put the xml stanza in the file and undefine/define domain but it 
gives an

error about cannot read image file.
source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the command 
line on

the DVD just fine.
What could be the problem?



don't put a mount dir, either use a ISO image, or the cdrom device file


  
Ok, a little closer now.  I put this in xml file and redefine domain 
and it now defines:


source file=/dev/sr0/
This was device that mount showed as mounting the DVD.

But when the domain boots and I select 3. CDROM from screen, it still 
shows the original boot error:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.
What should I change?


Here is what command looks like now using 'ps':
/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot 
c -drive 
file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-drive file=/dev/sr0,if=ide,media=cdrom,index=3 -net 
nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us



And I try other disk type:
disk type='block' device='cdrom'
But that produces the same error.

What else can I add in order to boot from cdrom?


What does:

md5sum /dev/sr0

output?


--
Tomasz Chmielewski
http://wpkg.org

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Tomasz Chmielewski wrote:

Gerry Reno schrieb:

Gerry Reno wrote:

Gerry Reno wrote:

Javier Guerra wrote:
On Tue, Mar 31, 2009 at 12:01 PM, Gerry Reno gr...@verizon.net 
wrote:
 

Charles Duffy wrote:
I put the xml stanza in the file and undefine/define domain but 
it gives an

error about cannot read image file.
source file=/media/Fedora 10 DVD/
And I check this path and I can read all the files from the 
command line on

the DVD just fine.
What could be the problem?



don't put a mount dir, either use a ISO image, or the cdrom device 
file



  
Ok, a little closer now.  I put this in xml file and redefine 
domain and it now defines:


source file=/dev/sr0/
This was device that mount showed as mounting the DVD.

But when the domain boots and I select 3. CDROM from screen, it 
still shows the original boot error:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.
What should I change?


Here is what command looks like now using 'ps':
/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty 
-boot c -drive 
file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on 
-drive file=/dev/sr0,if=ide,media=cdrom,index=3 -net 
nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none 
-usb -vnc 127.0.0.1:1 -k en-us



And I try other disk type:
disk type='block' device='cdrom'
But that produces the same error.

What else can I add in order to boot from cdrom?


What does:

md5sum /dev/sr0

output?



DVD is Fedora 10 DVD (i386)

Four cases:

# desktop user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# desktop user; DVD mounted
$ md5sum /dev/sr0
ff311b322c894aabc4361c4e270f5a3f  /dev/sr0

# root user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# root user; DVD mounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# 'ps' shows process is running as root:
root  7311  2927 99 13:08 ?00:20:01 /usr/bin/qemu-kvm -S -M 
pc -m 512 -smp 2 -name MX_3 -monitor pty -boot c -drive 
file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on -drive 
file=/dev/sr0,if=ide,media=cdrom,index=3 -net 
nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net 
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb 
-vnc 127.0.0.1:1 -k en-us



Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Tomasz Chmielewski

Gerry Reno schrieb:


What does:

md5sum /dev/sr0

output?



DVD is Fedora 10 DVD (i386)

Four cases:

# desktop user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# desktop user; DVD mounted
$ md5sum /dev/sr0
ff311b322c894aabc4361c4e270f5a3f  /dev/sr0


Download the iso file to your disk and point kvm there.

It's the easiest to do; your problem is not really kvm-specific.


--
Tomasz Chmielewski
http://wpkg.org


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Tomasz Chmielewski wrote:

Gerry Reno schrieb:


What does:

md5sum /dev/sr0

output?



DVD is Fedora 10 DVD (i386)

Four cases:

# desktop user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# desktop user; DVD mounted
$ md5sum /dev/sr0
ff311b322c894aabc4361c4e270f5a3f  /dev/sr0


Download the iso file to your disk and point kvm there.

It's the easiest to do; your problem is not really kvm-specific.


I'll try that but it seems as though the process being owned by root is 
preventing the access to the cdrom.  So isn't that kvm?  Does libvirt 
know this?  I mean never once have we been able to use the cdrom from 
the VM.  Not just for this problem.  This seems to be some kind of 
access problem.


Regards,
Gerry





--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 00/17] virtual-bus

2009-03-31 Thread Gregory Haskins
applies to v2.6.29 (will port to git HEAD soon)

FIRST OFF: Let me state that this is not a KVM or networking specific
technology.  Virtual-Bus is a mechanism for defining and deploying
software “devices” directly in a Linux kernel.  The example use-case we
have provided supports a “virtual-ethernet” device being utilized in a
KVM guest environment, so comparisons to virtio-net will be natural.
However, please note that this is but one use-case, of many we have
planned for the future (such as userspace bypass and RT guest support).
The goal for right now is to describe what a virtual-bus is and why we
believe it is useful.

We are intent on getting this core technology merged, even if the networking
components are not accepted as is.  It should be noted that, in many ways,
virtio could be considered complementary to the technology.  We could
in fact, have implemented the virtual-ethernet using a virtio-ring, but
it would have required ABI changes that we didn't want to yet propose
without having the concept in general vetted and accepted by the community.

To cut to the chase, we recently measured our virtual-ethernet on 
v2.6.29 on two 8-core x86_64 boxes with Chelsio T3 10GE connected back
to back via cross over.  We measured bare-metal performance, as well
as a kvm guest (running the same kernel) connected to the T3 via
a linux-bridge+tap configuration with a 1500 MTU.  The results are as
follows:

Bare metal: tput = 4078Mb/s, round-trip = 25593pps (39us rtt)
Virtio-net: tput = 4003Mb/s, round-trip = 320pps (3125us rtt)
Venet: tput = 4050Mb/s, round-trip = 15255pps (65us rtt)

As you can see, all three technologies can achieve (MTU limited) line-rate,
but the virtio-net solution is severely limited on the latency front (by a
factor of 48:1).

Note that the 320pps is technically artificially low in virtio-net, caused by
a known design limitation to use a timer for tx-mitigation.  However, note that
even when removing the timer from the path the best we could achieve was
350us-450us of latency, and doing so causes the tput to drop to 1300Mb/s.
So even in this case, I think the in-kernel results present a compelling
argument for the new model presented.

When we jump to 9000 byte MTU, the situation looks similar

Bare metal: tput = 9717Mb/s, round-trip = 30396pps (33us rtt)
Virtio-net: tput = 4578Mb/s, round-trip = 249pps (4016us rtt)
Venet: tput = 5802Mb/s, round-trip = 15127pps (66us rtt)


Note that even the throughput was slightly better in this test for venet, though
neither venet nor virtio-net could achieve line-rate.  I suspect some tuning may
allow these numbers to improve, TBD.

So with that said, let's jump into the description:

Virtual-Bus: What is it?


Virtual-Bus is a kernel based IO resource container technology.  It is modeled
on a concept similar to the Linux Device-Model (LDM), where we have buses,
devices, and drivers as the primary actors.  However, VBUS has several
distinctions when contrasted with LDM:

  1) Busses in LDM are relatively static and global to the kernel (e.g.
 PCI, USB, etc).  VBUS buses are arbitrarily created and destroyed
 dynamically, and are not globally visible.  Instead they are defined as
 visible only to a specific subset of the system (the contained context).
  2) Devices in LDM are typically tangible physical (or sometimes logical)
 devices.  VBUS devices are purely software abstractions (which may or
 may not have one or more physical devices behind them).  Devices may
 also be arbitrarily created or destroyed by software/administrative action
 as opposed to by a hardware discovery mechanism.
  3) Drivers in LDM sit within the same kernel context as the busses and
 devices they interact with.  VBUS drivers live in a foreign
 context (such as userspace, or a virtual-machine guest).

The idea is that a vbus is created to contain access to some IO services.
Virtual devices are then instantiated and linked to a bus to grant access to
drivers actively present on the bus.  Drivers will only have visibility to
devices present on their respective bus, and nothing else.

Virtual devices are defined by modules which register a deviceclass with the
system.  A deviceclass simply represents a type of device that _may_ be
instantiated into a device, should an administrator wish to do so.  Once
this has happened, the device may be associated with one or more buses where
it will become visible to all clients of those respective buses.

Why do we need this?
--

There are various reasons why such a construct may be useful.  One of the
most interesting use cases is for virtualization, such as KVM.  Hypervisors
today provide virtualized IO resources to a guest, but this is often at a cost
in both latency and throughput compared to bare metal performance.  Utilizing
para-virtual resources instead of emulated devices helps to mitigate this
penalty, but even these techniques to date have not fully realized the

[RFC PATCH 01/17] shm-signal: shared-memory signals

2009-03-31 Thread Gregory Haskins
This interface provides a bidirectional shared-memory based signaling
mechanism.  It can be used by any entities which desire efficient
communication via shared memory.  The implementation details of the
signaling are abstracted so that they may transcend a wide variety
of locale boundaries (e.g. userspace/kernel, guest/host, etc).

The shm_signal mechanism supports event masking as well as spurious
event delivery mitigation.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/shm_signal.h |  188 
 lib/Kconfig|   10 ++
 lib/Makefile   |1 
 lib/shm_signal.c   |  186 
 4 files changed, 385 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/shm_signal.h
 create mode 100644 lib/shm_signal.c

diff --git a/include/linux/shm_signal.h b/include/linux/shm_signal.h
new file mode 100644
index 000..a65e54e
--- /dev/null
+++ b/include/linux/shm_signal.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_SHM_SIGNAL_H
+#define _LINUX_SHM_SIGNAL_H
+
+#include asm/types.h
+
+/*
+ *-
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *---
+ */
+
+#define SHM_SIGNAL_MAGIC 0x58fa39df
+#define SHM_SIGNAL_VER   1
+
+struct shm_signal_irq {
+   __u8  enabled;
+   __u8  pending;
+   __u8  dirty;
+};
+
+enum shm_signal_locality {
+   shm_locality_north,
+   shm_locality_south,
+};
+
+struct shm_signal_desc {
+   __u32 magic;
+   __u32 ver;
+   struct shm_signal_irq irq[2];
+};
+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include linux/interrupt.h
+
+struct shm_signal_notifier {
+   void (*signal)(struct shm_signal_notifier *);
+};
+
+struct shm_signal;
+
+struct shm_signal_ops {
+   int  (*inject)(struct shm_signal *s);
+   void (*fault)(struct shm_signal *s, const char *fmt, ...);
+   void (*release)(struct shm_signal *s);
+};
+
+enum {
+   shm_signal_in_wakeup,
+};
+
+struct shm_signal {
+   atomic_trefs;
+   spinlock_t  lock;
+   enum shm_signal_localitylocale;
+   unsigned long   flags;
+   struct shm_signal_ops  *ops;
+   struct shm_signal_desc *desc;
+   struct shm_signal_notifier *notifier;
+   struct tasklet_struct   deferred_notify;
+};
+
+#define SHM_SIGNAL_FAULT(s, fmt, args...)  \
+  ((s)-ops-fault ? (s)-ops-fault((s), fmt, ## args) : panic(fmt, ## args))
+
+ /*
+  * These functions should only be used internally
+  */
+void _shm_signal_release(struct shm_signal *s);
+void _shm_signal_wakeup(struct shm_signal *s);
+
+/**
+ * shm_signal_init() - initialize an SHM_SIGNAL
+ * @s:SHM_SIGNAL context
+ *
+ * Initializes SHM_SIGNAL context before first use
+ *
+ **/
+void shm_signal_init(struct shm_signal *s);
+
+/**
+ * shm_signal_get() - acquire an SHM_SIGNAL context reference
+ * @s:SHM_SIGNAL context
+ *
+ **/
+static inline struct shm_signal *shm_signal_get(struct shm_signal *s)
+{
+   atomic_inc(s-refs);
+
+   return s;
+}
+
+/**
+ * shm_signal_put() - release an SHM_SIGNAL context reference
+ * @s:SHM_SIGNAL context
+ *
+ **/
+static inline void shm_signal_put(struct shm_signal *s)
+{
+   if (atomic_dec_and_test(s-refs))
+   _shm_signal_release(s);
+}
+
+/**
+ * shm_signal_enable() - enables local notifications on an SHM_SIGNAL
+ * @s:SHM_SIGNAL context
+ * @flags:  Reserved for future use, must be 0
+ *
+ * Enables/unmasks the registered notifier (if applicable) to receive wakeups
+ * whenever the remote side performs an shm_signal() operation. A notification
+ * will be dispatched immediately if any pending signals have already been
+ * issued prior to invoking this call.
+ *
+ * This is synonymous with unmasking an interrupt.
+ *

[RFC PATCH 03/17] vbus: add connection-client helper infrastructure

2009-03-31 Thread Gregory Haskins
We expect to have various types of connection-clients (e.g. userspace,
kvm, etc), each of which is likely to have common access patterns and
marshalling duties.  Therefore we create a client API to simplify
client development by helping with mundane tasks such as handle-2-pointer
translation, etc.

Special thanks to Pat Mullaney for suggesting the optimization to pass
a cookie object down during DEVICESHM operations to save lookup overhead
on the event channel.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/vbus_client.h |  115 +
 kernel/vbus/Makefile|2 
 kernel/vbus/client.c|  527 +++
 3 files changed, 643 insertions(+), 1 deletions(-)
 create mode 100644 include/linux/vbus_client.h
 create mode 100644 kernel/vbus/client.c

diff --git a/include/linux/vbus_client.h b/include/linux/vbus_client.h
new file mode 100644
index 000..62dab78
--- /dev/null
+++ b/include/linux/vbus_client.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Virtual-Bus - Client interface
+ *
+ * We expect to have various types of connection-clients (e.g. userspace,
+ * kvm, etc).  Each client will be connecting from some environment outside
+ * of the kernel, and therefore will not have direct access to the API as
+ * presented in ./linux/vbus.h.  There will undoubtedly be some parameter
+ * marshalling that must occur, as well as common patterns for the handling
+ * of those marshalled parameters (e.g. translating a handle into a pointer,
+ * etc).
+ *
+ * Therefore this client API is provided to simplify the development
+ * of any clients.  Of course, a client is free to bypass this API entirely
+ * and communicate with the direct VBUS API if desired.
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_CLIENT_H
+#define _LINUX_VBUS_CLIENT_H
+
+#include linux/types.h
+#include linux/compiler.h
+
+struct vbus_deviceopen {
+   __u32 devid;
+   __u32 version; /* device ABI version */
+   __u64 handle; /* return value for devh */
+};
+
+struct vbus_devicecall {
+   __u64 devh;   /* device-handle (returned from DEVICEOPEN */
+   __u32 func;
+   __u32 len;
+   __u32 flags;
+   __u64 datap;
+};
+
+struct vbus_deviceshm {
+   __u64 devh;   /* device-handle (returned from DEVICEOPEN */
+   __u32 id;
+   __u32 len;
+   __u32 flags;
+   struct {
+   __u32 offset;
+   __u32 prio;
+   __u64 cookie; /* token to pass back when signaling client */
+   } signal;
+   __u64 datap;
+   __u64 handle; /* return value for signaling from client to kernel */
+};
+
+#ifdef __KERNEL__
+
+#include linux/ioq.h
+#include linux/module.h
+#include asm/atomic.h
+
+struct vbus_client;
+
+struct vbus_client_ops {
+   int (*deviceopen)(struct vbus_client *client,  struct vbus_memctx *ctx,
+ __u32 devid, __u32 version, __u64 *devh);
+   int (*deviceclose)(struct vbus_client *client, __u64 devh);
+   int (*devicecall)(struct vbus_client *client,
+ __u64 devh, __u32 func,
+ void *data, __u32 len, __u32 flags);
+   int (*deviceshm)(struct vbus_client *client,
+__u64 devh, __u32 id,
+struct vbus_shm *shm, struct shm_signal *signal,
+__u32 flags, __u64 *handle);
+   int (*shmsignal)(struct vbus_client *client, __u64 handle);
+   void (*release)(struct vbus_client *client);
+};
+
+struct vbus_client {
+   atomic_t refs;
+   struct vbus_client_ops *ops;
+};
+
+static inline void vbus_client_get(struct vbus_client *client)
+{
+   atomic_inc(client-refs);
+}
+
+static inline void vbus_client_put(struct vbus_client *client)
+{
+   if (atomic_dec_and_test(client-refs))
+   client-ops-release(client);
+}
+
+struct vbus_client *vbus_client_attach(struct vbus *bus);
+
+extern struct vbus_memctx *current_memctx;
+struct vbus_memctx *task_memctx_alloc(struct task_struct *task);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_VBUS_CLIENT_H */
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 367f65b..4d440e5 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ 

[RFC PATCH 06/17] ioq: Add basic definitions for a shared-memory, lockless queue

2009-03-31 Thread Gregory Haskins
We can map these over VBUS shared memory (or really any shared-memory
architecture if it supports shm-signals) to allow asynchronous
communication between two end-points.  Memory is synchronized using
pure barriers (i.e. lockless), so IOQs are friendly in many contexts,
even if the memory is remote.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/ioq.h |  410 +++
 lib/Kconfig |   12 +
 lib/Makefile|1 
 lib/ioq.c   |  298 +
 4 files changed, 721 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/ioq.h
 create mode 100644 lib/ioq.c

diff --git a/include/linux/ioq.h b/include/linux/ioq.h
new file mode 100644
index 000..d450d9a
--- /dev/null
+++ b/include/linux/ioq.h
@@ -0,0 +1,410 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * IOQ is a generic shared-memory, lockless queue mechanism. It can be used
+ * in a variety of ways, though its intended purpose is to become the
+ * asynchronous communication path for virtual-bus drivers.
+ *
+ * The following are a list of key design points:
+ *
+ * #) All shared-memory is always allocated on explicitly one side of the
+ *link.  This typically would be the guest side in a VM/VMM scenario.
+ * #) Each IOQ has the concept of north and south locales, where
+ *north denotes the memory-owner side (e.g. guest).
+ * #) An IOQ is manipulated using an iterator idiom.
+ * #) Provides a bi-directional signaling/notification infrastructure on
+ *a per-queue basis, which includes an event mitigation strategy
+ *to reduce boundary switching.
+ * #) The signaling path is abstracted so that various technologies and
+ *topologies can define their own specific implementation while sharing
+ *the basic structures and code.
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_IOQ_H
+#define _LINUX_IOQ_H
+
+#include asm/types.h
+#include linux/shm_signal.h
+
+/*
+ *-
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *---
+ */
+struct ioq_ring_desc {
+   __u64 cookie; /* for arbitrary use by north-side */
+   __u64 ptr;
+   __u64 len;
+   __u8  valid;
+   __u8  sown; /* South owned = 1, North owned = 0 */
+};
+
+#define IOQ_RING_MAGIC 0x47fa2fe4
+#define IOQ_RING_VER   4
+
+struct ioq_ring_idx {
+   __u32 head;/* 0 based index to head of ptr array */
+   __u32 tail;/* 0 based index to tail of ptr array */
+   __u8  full;
+};
+
+enum ioq_locality {
+   ioq_locality_north,
+   ioq_locality_south,
+};
+
+struct ioq_ring_head {
+   __u32  magic;
+   __u32  ver;
+   struct shm_signal_desc signal;
+   struct ioq_ring_idxidx[2];
+   __u32  count;
+   struct ioq_ring_desc   ring[1]; /* count elements will be allocated */
+};
+
+#define IOQ_HEAD_DESC_SIZE(count) \
+(sizeof(struct ioq_ring_head) + sizeof(struct ioq_ring_desc) * (count - 1))
+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include linux/sched.h
+#include linux/wait.h
+#include linux/interrupt.h
+#include linux/shm_signal.h
+#include asm/atomic.h
+
+enum ioq_idx_type {
+   ioq_idxtype_valid,
+   ioq_idxtype_inuse,
+   ioq_idxtype_both,
+   ioq_idxtype_invalid,
+};
+
+enum ioq_seek_type {
+   ioq_seek_tail,
+   ioq_seek_next,
+   ioq_seek_head,
+   ioq_seek_set
+};
+
+struct ioq_iterator {
+   struct ioq*ioq;
+   struct ioq_ring_idx   *idx;
+   u32pos;
+   struct ioq_ring_desc  *desc;
+   intupdate:1;
+   intdualidx:1;
+   intflipowner:1;
+};
+
+struct ioq_notifier {
+   void (*signal)(struct ioq_notifier *);
+};
+
+struct ioq_ops {
+   void (*release)(struct ioq *ioq);
+};
+

[RFC PATCH 05/17] vbus: add a vbus-proxy bus model for vbus_driver objects

2009-03-31 Thread Gregory Haskins
This will generally be used for hypervisors to publish any host-side
virtual devices up to a guest.  The guest will have the opportunity
to consume any devices present on the vbus-proxy as if they were
platform devices, similar to existing buses like PCI.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/vbus_driver.h |   73 +
 kernel/vbus/Kconfig |9 +++
 kernel/vbus/Makefile|4 +
 kernel/vbus/proxy.c |  152 +++
 4 files changed, 238 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/vbus_driver.h
 create mode 100644 kernel/vbus/proxy.c

diff --git a/include/linux/vbus_driver.h b/include/linux/vbus_driver.h
new file mode 100644
index 000..c53e13f
--- /dev/null
+++ b/include/linux/vbus_driver.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Mediates access to a host VBUS from a guest kernel by providing a
+ * global view of all VBUS devices
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VBUS_DRIVER_H
+#define _LINUX_VBUS_DRIVER_H
+
+#include linux/device.h
+#include linux/shm_signal.h
+
+struct vbus_device_proxy;
+struct vbus_driver;
+
+struct vbus_device_proxy_ops {
+   int (*open)(struct vbus_device_proxy *dev, int version, int flags);
+   int (*close)(struct vbus_device_proxy *dev, int flags);
+   int (*shm)(struct vbus_device_proxy *dev, int id, int prio,
+  void *ptr, size_t len,
+  struct shm_signal_desc *sigdesc, struct shm_signal **signal,
+  int flags);
+   int (*call)(struct vbus_device_proxy *dev, u32 func,
+   void *data, size_t len, int flags);
+   void (*release)(struct vbus_device_proxy *dev);
+};
+
+struct vbus_device_proxy {
+   char  *type;
+   u64id;
+   void  *priv; /* Used by drivers */
+   struct vbus_device_proxy_ops  *ops;
+   struct device  dev;
+};
+
+int vbus_device_proxy_register(struct vbus_device_proxy *dev);
+void vbus_device_proxy_unregister(struct vbus_device_proxy *dev);
+
+struct vbus_device_proxy *vbus_device_proxy_find(u64 id);
+
+struct vbus_driver_ops {
+   int (*probe)(struct vbus_device_proxy *dev);
+   int (*remove)(struct vbus_device_proxy *dev);
+};
+
+struct vbus_driver {
+   char  *type;
+   struct module *owner;
+   struct vbus_driver_ops*ops;
+   struct device_driver   drv;
+};
+
+int vbus_driver_register(struct vbus_driver *drv);
+void vbus_driver_unregister(struct vbus_driver *drv);
+
+#endif /* _LINUX_VBUS_DRIVER_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index f2b92f5..3aaa085 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -12,3 +12,12 @@ config VBUS
various tasks and devices which reside on the bus.
 
If unsure, say N
+
+config VBUS_DRIVERS
+   tristate VBUS Driver support
+   default n
+   help
+Adds support for a virtual bus model for proxying drivers.
+
+   If unsure, say N
+
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index 4d440e5..d028ece 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1 +1,5 @@
 obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
+
+vbus-proxy-objs += proxy.o
+obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
+
diff --git a/kernel/vbus/proxy.c b/kernel/vbus/proxy.c
new file mode 100644
index 000..ea48f00
--- /dev/null
+++ b/kernel/vbus/proxy.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * 

[RFC PATCH 04/17] vbus: add bus-registration notifiers

2009-03-31 Thread Gregory Haskins
We need to get hotswap events in environments which cannot use existing
facilities (e.g. inotify).  So we add a notifier-chain to allow client
callbacks whenever an interface is {un}registered.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/vbus.h |   15 +
 kernel/vbus/core.c   |   59 ++
 kernel/vbus/vbus.h   |1 +
 3 files changed, 75 insertions(+), 0 deletions(-)

diff --git a/include/linux/vbus.h b/include/linux/vbus.h
index 5f0566c..04db4ff 100644
--- a/include/linux/vbus.h
+++ b/include/linux/vbus.h
@@ -29,6 +29,7 @@
 #include linux/sched.h
 #include linux/rcupdate.h
 #include linux/vbus_device.h
+#include linux/notifier.h
 
 struct vbus;
 struct task_struct;
@@ -137,6 +138,20 @@ static inline void task_vbus_disassociate(struct 
task_struct *p)
}
 }
 
+enum {
+   VBUS_EVENT_DEVADD,
+   VBUS_EVENT_DEVDROP,
+};
+
+struct vbus_event_devadd {
+   const char   *type;
+   unsigned long id;
+};
+
+int vbus_notifier_register(struct vbus *vbus, struct notifier_block *nb);
+int vbus_notifier_unregister(struct vbus *vbus, struct notifier_block *nb);
+
+
 #else /* CONFIG_VBUS */
 
 #define fork_vbus(p) do { } while (0)
diff --git a/kernel/vbus/core.c b/kernel/vbus/core.c
index 033999f..b6df487 100644
--- a/kernel/vbus/core.c
+++ b/kernel/vbus/core.c
@@ -89,6 +89,7 @@ int vbus_device_interface_register(struct vbus_device *dev,
 {
int ret;
struct vbus_devshell *ds = to_devshell(dev-kobj);
+   struct vbus_event_devadd ev;
 
mutex_lock(vbus-lock);
 
@@ -124,6 +125,14 @@ int vbus_device_interface_register(struct vbus_device *dev,
if (ret)
goto error;
 
+   ev.type = intf-type;
+   ev.id   = intf-id;
+
+   /* and let any clients know about the new device */
+   ret = raw_notifier_call_chain(vbus-notifier, VBUS_EVENT_DEVADD, ev);
+   if (ret  0)
+   goto error;
+
mutex_unlock(vbus-lock);
 
return 0;
@@ -144,6 +153,7 @@ int vbus_device_interface_unregister(struct 
vbus_device_interface *intf)
 
mutex_lock(vbus-lock);
_interface_unregister(intf);
+   raw_notifier_call_chain(vbus-notifier, VBUS_EVENT_DEVDROP, intf-id);
mutex_unlock(vbus-lock);
 
kobject_put(intf-kobj);
@@ -346,6 +356,8 @@ int vbus_create(const char *name, struct vbus **bus)
 
_bus-next_id = 0;
 
+   RAW_INIT_NOTIFIER_HEAD(_bus-notifier);
+
mutex_lock(vbus_root.lock);
 
ret = map_add(vbus_root.buses.map, _bus-node);
@@ -358,6 +370,53 @@ int vbus_create(const char *name, struct vbus **bus)
return 0;
 }
 
+#define for_each_rbnode(node, root) \
+   for (node = rb_first(root); node != NULL; node = rb_next(node))
+
+int vbus_notifier_register(struct vbus *vbus, struct notifier_block *nb)
+{
+   int ret;
+   struct rb_node *node;
+
+   mutex_lock(vbus-lock);
+
+   /*
+* resync the client for any devices we might already have
+*/
+   for_each_rbnode(node, vbus-devices.map.root) {
+   struct vbus_device_interface *intf = node_to_intf(node);
+   struct vbus_event_devadd ev = {
+   .type = intf-type,
+   .id   = intf-id,
+   };
+
+   ret = nb-notifier_call(nb, VBUS_EVENT_DEVADD, ev);
+   if (ret  NOTIFY_STOP_MASK) {
+   mutex_unlock(vbus-lock);
+   return -EPERM;
+   }
+   }
+
+   ret = raw_notifier_chain_register(vbus-notifier, nb);
+
+   mutex_unlock(vbus-lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_notifier_register);
+
+int vbus_notifier_unregister(struct vbus *vbus, struct notifier_block *nb)
+{
+   int ret;
+
+   mutex_lock(vbus-lock);
+   ret = raw_notifier_chain_unregister(vbus-notifier, nb);
+   mutex_unlock(vbus-lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_notifier_unregister);
+
 static void devshell_release(struct kobject *kobj)
 {
struct vbus_devshell *ds = container_of(kobj,
diff --git a/kernel/vbus/vbus.h b/kernel/vbus/vbus.h
index 1266d69..cd2676b 100644
--- a/kernel/vbus/vbus.h
+++ b/kernel/vbus/vbus.h
@@ -51,6 +51,7 @@ struct vbus {
struct vbus_subdir members;
unsigned long next_id;
struct rb_node node;
+   struct raw_notifier_head notifier;
 };
 
 struct vbus_member {

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 08/17] venet: add the ABI definitions for an 802.x packet interface

2009-03-31 Thread Gregory Haskins
Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/venet.h |   47 +++
 1 files changed, 47 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/venet.h

diff --git a/include/linux/venet.h b/include/linux/venet.h
new file mode 100644
index 000..ef6b199
--- /dev/null
+++ b/include/linux/venet.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2008 Novell.  All Rights Reserved.
+ *
+ * Virtual-Ethernet adapter
+ *
+ * Author:
+ *  Gregory Haskins ghask...@novell.com
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#ifndef _LINUX_VENET_H
+#define _LINUX_VENET_H
+
+#define VENET_VERSION 1
+
+#define VENET_TYPE virtual-ethernet
+
+#define VENET_QUEUE_RX 0
+#define VENET_QUEUE_TX 1
+
+struct venet_capabilities {
+   __u32 gid;
+   __u32 bits;
+};
+
+/* CAPABILITIES-GROUP 0 */
+/* #define VENET_CAP_FOO0   (No capabilities defined yet, for now) */
+
+#define VENET_FUNC_LINKUP   0
+#define VENET_FUNC_LINKDOWN 1
+#define VENET_FUNC_MACQUERY 2
+#define VENET_FUNC_NEGCAP   3 /* negotiate capabilities */
+#define VENET_FUNC_FLUSHRX  4
+
+#endif /* _LINUX_VENET_H */

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 11/17] venet: add scatter-gather support

2009-03-31 Thread Gregory Haskins
Signed-off-by: Gregory Haskins ghask...@novell.com
---

 drivers/net/vbus-enet.c |  249 +--
 include/linux/venet.h   |   39 +++
 2 files changed, 275 insertions(+), 13 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index e698b3f..8e96c9c 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -42,6 +42,8 @@ static int rx_ringlen = 256;
 module_param(rx_ringlen, int, 0444);
 static int tx_ringlen = 256;
 module_param(tx_ringlen, int, 0444);
+static int sg_enabled = 1;
+module_param(sg_enabled, int, 0444);
 
 #undef PDEBUG /* undef it, just in case */
 #ifdef VBUS_ENET_DEBUG
@@ -64,8 +66,17 @@ struct vbus_enet_priv {
struct vbus_enet_queue rxq;
struct vbus_enet_queue txq;
struct tasklet_struct  txtask;
+   struct {
+   intsg:1;
+   inttso:1;
+   intufo:1;
+   inttso6:1;
+   intecn:1;
+   } flags;
 };
 
+static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
+
 static struct vbus_enet_priv *
 napi_to_priv(struct napi_struct *napi)
 {
@@ -199,6 +210,93 @@ rx_teardown(struct vbus_enet_priv *priv)
}
 }
 
+static int
+tx_setup(struct vbus_enet_priv *priv)
+{
+   struct ioq *ioq = priv-txq.queue;
+   struct ioq_iterator iter;
+   int i;
+   int ret;
+
+   if (!priv-flags.sg)
+   /*
+* There is nothing to do for a ring that is not using
+* scatter-gather
+*/
+   return 0;
+
+   ret = ioq_iter_init(ioq, iter, ioq_idxtype_valid, 0);
+   BUG_ON(ret  0);
+
+   ret = ioq_iter_seek(iter, ioq_seek_set, 0, 0);
+   BUG_ON(ret  0);
+
+   /*
+* Now populate each descriptor with an empty SG descriptor
+*/
+   for (i = 0; i  tx_ringlen; i++) {
+   struct venet_sg *vsg;
+   size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
+   size_t len = sizeof(*vsg) + iovlen;
+
+   vsg = kzalloc(len, GFP_KERNEL);
+   if (!vsg)
+   return -ENOMEM;
+
+   iter.desc-cookie = (u64)vsg;
+   iter.desc-len= len;
+   iter.desc-ptr= (u64)__pa(vsg);
+
+   ret = ioq_iter_seek(iter, ioq_seek_next, 0, 0);
+   BUG_ON(ret  0);
+   }
+
+   return 0;
+}
+
+static void
+tx_teardown(struct vbus_enet_priv *priv)
+{
+   struct ioq *ioq = priv-txq.queue;
+   struct ioq_iterator iter;
+   int ret;
+
+   /* forcefully free all outstanding transmissions */
+   vbus_enet_tx_reap(priv, 1);
+
+   if (!priv-flags.sg)
+   /*
+* There is nothing else to do for a ring that is not using
+* scatter-gather
+*/
+   return;
+
+   ret = ioq_iter_init(ioq, iter, ioq_idxtype_valid, 0);
+   BUG_ON(ret  0);
+
+   /* seek to position 0 */
+   ret = ioq_iter_seek(iter, ioq_seek_set, 0, 0);
+   BUG_ON(ret  0);
+
+   /*
+* free each valid descriptor
+*/
+   while (iter.desc-cookie) {
+   struct venet_sg *vsg = (struct venet_sg *)iter.desc-cookie;
+
+   iter.desc-valid = 0;
+   wmb();
+
+   iter.desc-ptr = 0;
+   iter.desc-cookie = 0;
+
+   ret = ioq_iter_seek(iter, ioq_seek_next, 0, 0);
+   BUG_ON(ret  0);
+
+   kfree(vsg);
+   }
+}
+
 /*
  * Open and close
  */
@@ -403,14 +501,67 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device 
*dev)
BUG_ON(ret  0);
BUG_ON(iter.desc-sown);
 
-   /*
-* We simply put the skb right onto the ring.  We will get an interrupt
-* later when the data has been consumed and we can reap the pointers
-* at that time
-*/
-   iter.desc-cookie = (u64)skb;
-   iter.desc-len = (u64)skb-len;
-   iter.desc-ptr = (u64)__pa(skb-data);
+   if (priv-flags.sg) {
+   struct venet_sg *vsg = (struct venet_sg *)iter.desc-cookie;
+   struct scatterlist sgl[MAX_SKB_FRAGS+1];
+   struct scatterlist *sg;
+   int count, maxcount = ARRAY_SIZE(sgl);
+
+   sg_init_table(sgl, maxcount);
+
+   memset(vsg, 0, sizeof(*vsg));
+
+   vsg-cookie = (u64)skb;
+   vsg-len= skb-len;
+
+   if (skb-ip_summed == CHECKSUM_PARTIAL) {
+   vsg-flags  |= VENET_SG_FLAG_NEEDS_CSUM;
+   vsg-csum.start  = skb-csum_start - skb_headroom(skb);
+   vsg-csum.offset = skb-csum_offset;
+   }
+
+   if (skb_is_gso(skb)) {
+   struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+   

[RFC PATCH 10/17] venet-tap: Adds a venet compatible tap device to VBUS

2009-03-31 Thread Gregory Haskins
This module is similar in concept to a tuntap.  A tuntap module provides
a netif() interface on one side, and a char-dev interface on the other.
Packets that ingress on one interface, egress on the other (and vice versa).

This module offers a similar concept, except that it substitutes the
char-dev for a VBUS/IOQ interface.  This allows a VBUS compatible entity
(e.g. userspace or a guest) to directly inject and receive packets
from the host/kernel stack.

Thanks to Pat Mullaney for contributing the maxcount modification

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 drivers/Makefile |1 
 drivers/vbus/devices/Kconfig |   17 
 drivers/vbus/devices/Makefile|1 
 drivers/vbus/devices/venet-tap.c | 1365 ++
 kernel/vbus/Kconfig  |   13 
 5 files changed, 1397 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vbus/devices/Kconfig
 create mode 100644 drivers/vbus/devices/Makefile
 create mode 100644 drivers/vbus/devices/venet-tap.c

diff --git a/drivers/Makefile b/drivers/Makefile
index c1bf417..98fab51 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -106,3 +106,4 @@ obj-$(CONFIG_SSB)   += ssb/
 obj-$(CONFIG_VIRTIO)   += virtio/
 obj-$(CONFIG_STAGING)  += staging/
 obj-y  += platform/
+obj-$(CONFIG_VBUS_DEVICES) += vbus/devices/
diff --git a/drivers/vbus/devices/Kconfig b/drivers/vbus/devices/Kconfig
new file mode 100644
index 000..64e4731
--- /dev/null
+++ b/drivers/vbus/devices/Kconfig
@@ -0,0 +1,17 @@
+#
+# Virtual-Bus (VBus) configuration
+#
+
+config VBUS_VENETTAP
+   tristate Virtual-Bus Ethernet Tap Device
+   depends on VBUS_DEVICES
+   default n
+   help
+Provides a virtual ethernet adapter to a vbus, which in turn
+manifests itself as a standard netif based adapter to the
+   kernel.  It can be used similarly to a tuntap device,
+except that the char-dev transport is replaced with a vbus/ioq
+interface.
+
+   If unsure, say N
+
diff --git a/drivers/vbus/devices/Makefile b/drivers/vbus/devices/Makefile
new file mode 100644
index 000..2ea7d2a
--- /dev/null
+++ b/drivers/vbus/devices/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_VBUS_VENETTAP) += venet-tap.o
diff --git a/drivers/vbus/devices/venet-tap.c b/drivers/vbus/devices/venet-tap.c
new file mode 100644
index 000..ccce58e
--- /dev/null
+++ b/drivers/vbus/devices/venet-tap.c
@@ -0,0 +1,1365 @@
+/*
+ * venettap - A 802.x virtual network device based on the VBUS/IOQ interface
+ *
+ * Copyright (C) 2009 Novell, Gregory Haskins ghask...@novell.com
+ *
+ * Derived from the SNULL example from the book Linux Device Drivers by
+ * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
+ * by O'Reilly & Associates.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include linux/module.h
+#include linux/init.h
+#include linux/moduleparam.h
+
+#include linux/sched.h
+#include linux/kernel.h
+#include linux/slab.h
+#include linux/errno.h
+#include linux/types.h
+#include linux/interrupt.h
+
+#include linux/in.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/ip.h
+#include linux/tcp.h
+#include linux/skbuff.h
+#include linux/ioq.h
+#include linux/vbus.h
+#include linux/freezer.h
+#include linux/kthread.h
+
+#include linux/venet.h
+
+#include linux/in6.h
+#include asm/checksum.h
+
+MODULE_AUTHOR(Gregory Haskins);
+MODULE_LICENSE(GPL);
+
+#undef PDEBUG /* undef it, just in case */
+#ifdef VENETTAP_DEBUG
+#  define PDEBUG(fmt, args...) printk(KERN_DEBUG venet-tap:  fmt, ## args)
+#else
+#  define PDEBUG(fmt, args...) /* not debugging: nothing */
+#endif
+
+static int maxcount = 2048;
+module_param(maxcount, int, 0600);
+MODULE_PARM_DESC(maxcount, maximum size for rx/tx ioq ring);
+
+static void venettap_tx_isr(struct ioq_notifier *notifier);
+static int venettap_rx_thread(void *__priv);
+static int venettap_tx_thread(void *__priv);
+
+struct venettap_queue {
+   struct ioq  *queue;
+   struct ioq_notifier  notifier;
+};
+
+struct venettap;
+
+enum {
+   RX_SCHED,
+   TX_SCHED,
+   TX_NETIF_CONGESTED,
+   TX_IOQ_CONGESTED,
+};
+
+struct venettap {
+   spinlock_t   lock;
+   unsigned charhmac[ETH_ALEN]; /* host-mac 

[RFC PATCH 13/17] x86: allow the irq-vector translation to be determined outside of ioapic

2009-03-31 Thread Gregory Haskins
The ioapic code currently privately manages the mapping between irq
and vector.  This results in some layering violations as the support
for certain MSI operations needs this info.  As a result, the MSI
code itself was moved to the ioapic module.  This is not really
optimal.

We now have another need to gain access to the vector assignment on
x86.  However, rather than put yet another inappropriately placed
function into io-apic, let's create a way to export this simple data
and therefore allow the logic to sit closer to where it belongs.

Ideally we should abstract the entire notion of irq-vector management
out of io-apic, but we leave that as an exercise for another day.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/include/asm/irq.h |6 ++
 arch/x86/kernel/io_apic.c  |   25 +
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 592688e..b1726d8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -40,6 +40,12 @@ extern unsigned int do_IRQ(struct pt_regs *regs);
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 
+#ifdef CONFIG_SMP
+extern int set_irq_affinity(int irq, cpumask_t mask);
+#endif
+
+extern int irq_to_vector(int irq);
+
 /* Interrupt vector management */
 extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
 extern int vector_used_by_percpu_irq(unsigned int vector);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index bc7ac4d..86a2c36 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -614,6 +614,14 @@ set_ioapic_affinity_irq(unsigned int irq, const struct 
cpumask *mask)
 
set_ioapic_affinity_irq_desc(desc, mask);
 }
+
+int set_irq_affinity(int irq, cpumask_t mask)
+{
+   set_ioapic_affinity_irq(irq, mask);
+
+   return 0;
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -3249,6 +3257,23 @@ void destroy_irq(unsigned int irq)
spin_unlock_irqrestore(vector_lock, flags);
 }
 
+int irq_to_vector(int irq)
+{
+   struct irq_cfg *cfg;
+   unsigned long flags;
+   int ret = -ENOENT;
+
+   spin_lock_irqsave(vector_lock, flags);
+
+   cfg = irq_cfg(irq);
+   if (cfg  cfg-vector != 0)
+   ret = cfg-vector;
+
+   spin_unlock_irqrestore(vector_lock, flags);
+
+   return ret;
+}
+
 /*
  * MSI message composition
  */

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 14/17] kvm: add a reset capability

2009-03-31 Thread Gregory Haskins
We need a way to detect if a VM is reset later in the series, so let's
add a capability for userspace to signal a VM reset down to the kernel.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/kvm/x86.c   |1 +
 include/linux/kvm.h  |2 ++
 include/linux/kvm_host.h |6 ++
 virt/kvm/kvm_main.c  |   36 
 4 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 758b7a1..9b0a649 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -971,6 +971,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+   case KVM_CAP_RESET:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0424326..7ffd8f5 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -396,6 +396,7 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_USER_NMI
 #define KVM_CAP_USER_NMI 22
 #endif
+#define KVM_CAP_RESET 23
 
 /*
  * ioctls for VM fds
@@ -429,6 +430,7 @@ struct kvm_trace_rec {
   struct kvm_assigned_pci_dev)
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
struct kvm_assigned_irq)
+#define KVM_RESET_IO(KVMIO,  0x67)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bf6f703..506eca1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -17,6 +17,7 @@
 #include linux/preempt.h
 #include linux/marker.h
 #include linux/msi.h
+#include linux/notifier.h
 #include asm/signal.h
 
 #include linux/kvm.h
@@ -132,6 +133,8 @@ struct kvm {
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
 #endif
+
+   struct raw_notifier_head reset_notifier; /* triggers when VM reboots */
 };
 
 /* The guest did something we don't support. */
@@ -158,6 +161,9 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
+int kvm_reset_notifier_register(struct kvm *kvm, struct notifier_block *nb);
+int kvm_reset_notifier_unregister(struct kvm *kvm, struct notifier_block *nb);
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1  HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa  HPA_MSB; }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29a667c..fca2d25 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -868,6 +868,8 @@ static struct kvm *kvm_create_vm(void)
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
kvm_coalesced_mmio_init(kvm);
 #endif
+   RAW_INIT_NOTIFIER_HEAD(kvm-reset_notifier);
+
 out:
return kvm;
 }
@@ -1485,6 +1487,35 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
}
 }
 
+static void kvm_notify_reset(struct kvm *kvm)
+{
+   mutex_lock(kvm-lock);
+   raw_notifier_call_chain(kvm-reset_notifier, 0, kvm);
+   mutex_unlock(kvm-lock);
+}
+
+int kvm_reset_notifier_register(struct kvm *kvm, struct notifier_block *nb)
+{
+   int ret;
+
+   mutex_lock(kvm-lock);
+   ret = raw_notifier_chain_register(kvm-reset_notifier, nb);
+   mutex_unlock(kvm-lock);
+
+   return ret;
+}
+
+int kvm_reset_notifier_unregister(struct kvm *kvm, struct notifier_block *nb)
+{
+   int ret;
+
+   mutex_lock(kvm-lock);
+   ret = raw_notifier_chain_unregister(kvm-reset_notifier, nb);
+   mutex_unlock(kvm-lock);
+
+   return ret;
+}
+
 /*
  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
  */
@@ -1929,6 +1960,11 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
 #endif
+   case KVM_RESET: {
+   kvm_notify_reset(kvm);
+   r = 0;
+   break;
+   }
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 15/17] kvm: add dynamic IRQ support

2009-03-31 Thread Gregory Haskins
This patch provides the ability to dynamically declare and map an
interrupt-request handle to an x86 8-bit vector.

Problem Statement: Emulated devices (such as PCI, ISA, etc) have
interrupt routing done via standard PC mechanisms (MP-table, ACPI,
etc).  However, we also want to support a new class of devices
which exist in a new virtualized namespace and therefore should
not try to piggyback on these emulated mechanisms.  Rather, we
create a way to dynamically register interrupt resources that
acts independently of the emulated counterpart.

On x86, a simplistic view of the interrupt model is that each core
has a local-APIC which can receive messages from APIC-compliant
routing devices (such as IO-APIC and MSI) regarding details about
an interrupt (such as which vector to raise).  These routing devices
are controlled by the OS so they may translate a physical event
(such as e1000: raise an RX interrupt) to a logical destination
(such as inject IDT vector 46 on core 3).  A dynirq is a virtual
implementation of such a router (think of it as a virtual-MSI, but
without the coupling to an existing standard, such as PCI).

The model is simple: A guest OS can allocate the mapping of IRQ
handle to vector/core in any way it sees fit, and provide this
information to the dynirq module running in the host.  The assigned
IRQ then becomes the sole handle needed to inject an IDT vector
to the guest from a host.  A host entity that wishes to raise an
interrupt simply needs to call kvm_inject_dynirq(irq) and the routing
is performed transparently.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/Kconfig|5 +
 arch/x86/Makefile   |3 
 arch/x86/include/asm/kvm_host.h |9 +
 arch/x86/include/asm/kvm_para.h |   11 +
 arch/x86/kvm/Makefile   |3 
 arch/x86/kvm/dynirq.c   |  329 +++
 arch/x86/kvm/guest/Makefile |2 
 arch/x86/kvm/guest/dynirq.c |   95 +++
 arch/x86/kvm/x86.c  |6 +
 include/linux/kvm.h |1 
 include/linux/kvm_guest.h   |7 +
 include/linux/kvm_host.h|1 
 include/linux/kvm_para.h|1 
 13 files changed, 472 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/kvm/dynirq.c
 create mode 100644 arch/x86/kvm/guest/Makefile
 create mode 100644 arch/x86/kvm/guest/dynirq.c
 create mode 100644 include/linux/kvm_guest.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3fca247..91fefd5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -446,6 +446,11 @@ config KVM_GUEST
 This option enables various optimizations for running under the KVM
 hypervisor.
 
+config KVM_GUEST_DYNIRQ
+   bool KVM Dynamic IRQ support
+   depends on KVM_GUEST
+   default y
+
 source arch/x86/lguest/Kconfig
 
 config PARAVIRT
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d1a47ad..d788815 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -147,6 +147,9 @@ core-$(CONFIG_XEN) += arch/x86/xen/
 # lguest paravirtualization support
 core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
 
+# kvm paravirtualization support
+core-$(CONFIG_KVM_GUEST) += arch/x86/kvm/guest/
+
 core-y += arch/x86/kernel/
 core-y += arch/x86/mm/
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 730843d..9ae398a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -346,6 +346,12 @@ struct kvm_mem_alias {
gfn_t target_gfn;
 };
 
+struct kvm_dynirq {
+   spinlock_t lock;
+   struct rb_root map;
+   struct kvm *kvm;
+};
+
 struct kvm_arch{
int naliases;
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
@@ -363,6 +369,7 @@ struct kvm_arch{
struct iommu_domain *iommu_domain;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
+   struct kvm_dynirq *dynirq;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
int vapics_in_nmi_mode;
@@ -519,6 +526,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
  const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
  gpa_t addr, unsigned long *ret);
+int kvm_dynirq_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len);
+void kvm_free_dynirq(struct kvm *kvm);
 
 extern bool tdp_enabled;
 
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305..fba210e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -13,6 +13,7 @@
 #define KVM_FEATURE_CLOCKSOURCE0
 #define KVM_FEATURE_NOP_IO_DELAY   1
 #define KVM_FEATURE_MMU_OP 2
+#define KVM_FEATURE_DYNIRQ 3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
@@ -45,6 +46,16 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
 };
 
+/* Operations for KVM_HC_DYNIRQ */
+#define 

[RFC PATCH 16/17] kvm: Add VBUS support to the host

2009-03-31 Thread Gregory Haskins
This patch adds support for guest access to a VBUS assigned to the same
context as the VM.  It utilizes an IOQ+IRQ to move events from host->guest,
and provides a hypercall interface to move events guest->host.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 arch/x86/include/asm/kvm_para.h |1 
 arch/x86/kvm/Kconfig|9 
 arch/x86/kvm/Makefile   |3 
 arch/x86/kvm/x86.c  |6 
 arch/x86/kvm/x86.h  |   12 
 include/linux/kvm.h |1 
 include/linux/kvm_host.h|   20 +
 include/linux/kvm_para.h|   59 ++
 virt/kvm/kvm_main.c |1 
 virt/kvm/vbus.c | 1307 +++
 10 files changed, 1419 insertions(+), 0 deletions(-)
 create mode 100644 virt/kvm/vbus.c

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index fba210e..19d81e0 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -14,6 +14,7 @@
 #define KVM_FEATURE_NOP_IO_DELAY   1
 #define KVM_FEATURE_MMU_OP 2
 #define KVM_FEATURE_DYNIRQ 3
+#define KVM_FEATURE_VBUS4
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b81125f..875e96e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,15 @@ config KVM_TRACE
  relayfs.  Note the ABI is not considered stable and will be
  modified in future updates.
 
+config KVM_HOST_VBUS
+   bool KVM virtual-bus (VBUS) host-side support
+   depends on KVM
+   select VBUS
+   default n
+   ---help---
+  This option enables host-side support for accessing virtual-bus
+ devices.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d5676f5..f749ec9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -15,6 +15,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
i8254.o dynirq.o
+ifeq ($(CONFIG_KVM_HOST_VBUS),y)
+kvm-objs += $(addprefix ../../../virt/kvm/, vbus.o)
+endif
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e24f0a5..2369d84 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -996,6 +996,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_CLOCKSOURCE:
r = boot_cpu_has(X86_FEATURE_CONSTANT_TSC);
break;
+   case KVM_CAP_VBUS:
+   r = kvm_vbus_support();
+   break;
default:
r = 0;
break;
@@ -2688,6 +2691,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_DYNIRQ:
ret = kvm_dynirq_hc(vcpu, a0, a1, a2);
break;
+   case KVM_HC_VBUS:
+   ret = kvm_vbus_hc(vcpu, a0, a1, a2);
+   break;
default:
ret = -KVM_ENOSYS;
break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78..b6c682b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,18 @@
 
 #include linux/kvm_host.h
 
+#ifdef CONFIG_KVM_HOST_VBUS
+static inline int kvm_vbus_support(void)
+{
+return 1;
+}
+#else
+static inline int kvm_vbus_support(void)
+{
+return 0;
+}
+#endif
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
vcpu-arch.exception.pending = false;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 349d273..077daac 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -398,6 +398,7 @@ struct kvm_trace_rec {
 #endif
 #define KVM_CAP_RESET 23
 #define KVM_CAP_DYNIRQ 24
+#define KVM_CAP_VBUS 25
 
 /*
  * ioctls for VM fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bec9b35..757f998 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -120,6 +120,9 @@ struct kvm {
struct list_head vm_list;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
+#ifdef CONFIG_KVM_HOST_VBUS
+   struct kvm_vbus *kvbus;
+#endif
struct kvm_vm_stat stat;
struct kvm_arch arch;
atomic_t users_count;
@@ -471,4 +474,21 @@ static inline int mmu_notifier_retry(struct kvm_vcpu 
*vcpu, unsigned long mmu_se
 }
 #endif
 
+#ifdef CONFIG_KVM_HOST_VBUS
+
+int kvm_vbus_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len);
+void kvm_vbus_release(struct kvm_vbus *kvbus);
+
+#else /* CONFIG_KVM_HOST_VBUS */
+
+static inline int
+kvm_vbus_hc(struct kvm_vcpu *vcpu, int nr, gpa_t gpa, size_t len)
+{
+   return -EINVAL;
+}
+
+#define kvm_vbus_release(kvbus) do {} while (0)
+
+#endif /* CONFIG_KVM_HOST_VBUS */
+
 #endif
diff --git 

[RFC PATCH 12/17] venettap: add scatter-gather support

2009-03-31 Thread Gregory Haskins
Signed-off-by: Gregory Haskins ghask...@novell.com
---

 drivers/vbus/devices/venet-tap.c |  236 +-
 1 files changed, 229 insertions(+), 7 deletions(-)

diff --git a/drivers/vbus/devices/venet-tap.c b/drivers/vbus/devices/venet-tap.c
index ccce58e..0ccb7ed 100644
--- a/drivers/vbus/devices/venet-tap.c
+++ b/drivers/vbus/devices/venet-tap.c
@@ -80,6 +80,13 @@ enum {
TX_IOQ_CONGESTED,
 };
 
+struct venettap;
+
+struct venettap_rx_ops {
+   int (*decode)(struct venettap *priv, void *ptr, int len);
+   int (*import)(struct venettap *, struct sk_buff *, void *, int);
+};
+
 struct venettap {
spinlock_t   lock;
unsigned charhmac[ETH_ALEN]; /* host-mac */
@@ -107,6 +114,12 @@ struct venettap {
struct vbus_memctx  *ctx;
struct venettap_queuerxq;
struct venettap_queuetxq;
+   struct venettap_rx_ops  *rx_ops;
+   struct {
+   struct venet_sg *desc;
+   size_t   len;
+   int  enabled:1;
+   } sg;
int  connected:1;
int  opened:1;
int  link:1;
@@ -288,6 +301,183 @@ venettap_change_mtu(struct net_device *dev, int new_mtu)
 }
 
 /*
+ * ---
+ * Scatter-Gather support
+ * ---
+ */
+
+/* assumes reference to priv-vbus.conn held */
+static int
+venettap_sg_decode(struct venettap *priv, void *ptr, int len)
+{
+   struct venet_sg *vsg;
+   struct vbus_memctx *ctx;
+   int ret;
+
+   /*
+* SG is enabled, so we need to pull in the venet_sg
+* header before we can interpret the rest of the
+* packet
+*
+* FIXME: Make sure this is not too big
+*/
+   if (unlikely(len  priv-vbus.sg.len)) {
+   kfree(priv-vbus.sg.desc);
+   priv-vbus.sg.desc = kzalloc(len, GFP_KERNEL);
+   }
+
+   vsg = priv-vbus.sg.desc;
+   ctx = priv-vbus.ctx;
+
+   ret = ctx-ops-copy_from(ctx, vsg, ptr, len);
+   BUG_ON(ret);
+
+   /*
+* Non GSO type packets should be constrained by the MTU setting
+* on the host
+*/
+   if (!(vsg-flags  VENET_SG_FLAG_GSO)
+(vsg-len  (priv-netif.dev-mtu + ETH_HLEN)))
+   return -1;
+
+   return vsg-len;
+}
+
+/*
+ * venettap_sg_import - import an skb in scatter-gather mode
+ *
+ * assumes reference to priv-vbus.conn held
+ */
+static int
+venettap_sg_import(struct venettap *priv, struct sk_buff *skb,
+  void *ptr, int len)
+{
+   struct venet_sg *vsg = priv-vbus.sg.desc;
+   struct vbus_memctx *ctx = priv-vbus.ctx;
+   int remain = len;
+   int ret;
+   int i;
+
+   PDEBUG(Importing %d bytes in %d segments\n, len, vsg-count);
+
+   for (i = 0; i  vsg-count; i++) {
+   struct venet_iov *iov = vsg-iov[i];
+
+   if (remain  iov-len)
+   return -EINVAL;
+
+   PDEBUG(Segment %d: %p/%d\n, i, iov-ptr, iov-len);
+
+   ret = ctx-ops-copy_from(ctx, skb_tail_pointer(skb),
+(void *)iov-ptr,
+iov-len);
+   if (ret)
+   return -EFAULT;
+
+   skb_put(skb, iov-len);
+   remain -= iov-len;
+   }
+
+   if (vsg-flags  VENET_SG_FLAG_NEEDS_CSUM
+!skb_partial_csum_set(skb, vsg-csum.start, vsg-csum.offset))
+   return -EINVAL;
+
+   if (vsg-flags  VENET_SG_FLAG_GSO) {
+   struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+   PDEBUG(GSO packet detected\n);
+
+   switch (vsg-gso.type) {
+   case VENET_GSO_TYPE_TCPV4:
+   sinfo-gso_type = SKB_GSO_TCPV4;
+   break;
+   case VENET_GSO_TYPE_TCPV6:
+   sinfo-gso_type = SKB_GSO_TCPV6;
+   break;
+   case VENET_GSO_TYPE_UDP:
+   sinfo-gso_type = SKB_GSO_UDP;
+   break;
+   default:
+   PDEBUG(Illegal GSO type: %d\n, vsg-gso.type);
+   priv-netif.stats.rx_frame_errors++;
+   kfree_skb(skb);
+   return -EINVAL;
+   }
+
+   if (vsg-flags  VENET_SG_FLAG_ECN)
+   sinfo-gso_type |= SKB_GSO_TCP_ECN;
+
+   sinfo-gso_size = vsg-gso.size;
+   if (skb_shinfo(skb)-gso_size == 0) {
+   PDEBUG(Illegal GSO size: %d\n, vsg-gso.size);
+   priv-netif.stats.rx_frame_errors++;
+   kfree_skb(skb);
+ 

[RFC PATCH 07/17] ioq: add vbus helpers

2009-03-31 Thread Gregory Haskins
It will be common to map an IOQ over the VBUS shared-memory interfaces,
so let's generalize their setup so we can reuse the pattern.

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 include/linux/vbus_device.h |7 +++
 include/linux/vbus_driver.h |7 +++
 kernel/vbus/Kconfig |2 +
 kernel/vbus/Makefile|1 
 kernel/vbus/proxy.c |   64 +++
 kernel/vbus/shm-ioq.c   |   89 +++
 6 files changed, 170 insertions(+), 0 deletions(-)
 create mode 100644 kernel/vbus/shm-ioq.c

diff --git a/include/linux/vbus_device.h b/include/linux/vbus_device.h
index 705d92e..66990e2 100644
--- a/include/linux/vbus_device.h
+++ b/include/linux/vbus_device.h
@@ -102,6 +102,7 @@
 #include linux/configfs.h
 #include linux/rbtree.h
 #include linux/shm_signal.h
+#include linux/ioq.h
 #include linux/vbus.h
 #include asm/atomic.h
 
@@ -413,4 +414,10 @@ static inline void vbus_connection_put(struct 
vbus_connection *conn)
conn-ops-release(conn);
 }
 
+/*
+ * device-side IOQ helper - dereferences device-shm as an IOQ
+ */
+int vbus_shm_ioq_attach(struct vbus_shm *shm, struct shm_signal *signal,
+   int maxcount, struct ioq **ioq);
+
 #endif /* _LINUX_VBUS_DEVICE_H */
diff --git a/include/linux/vbus_driver.h b/include/linux/vbus_driver.h
index c53e13f..9cfbf60 100644
--- a/include/linux/vbus_driver.h
+++ b/include/linux/vbus_driver.h
@@ -26,6 +26,7 @@
 
 #include linux/device.h
 #include linux/shm_signal.h
+#include linux/ioq.h
 
 struct vbus_device_proxy;
 struct vbus_driver;
@@ -70,4 +71,10 @@ struct vbus_driver {
 int vbus_driver_register(struct vbus_driver *drv);
 void vbus_driver_unregister(struct vbus_driver *drv);
 
+/*
+ * driver-side IOQ helper - allocates device-shm and maps an IOQ on it
+ */
+int vbus_driver_ioq_alloc(struct vbus_device_proxy *dev, int id, int prio,
+ size_t ringsize, struct ioq **ioq);
+
 #endif /* _LINUX_VBUS_DRIVER_H */
diff --git a/kernel/vbus/Kconfig b/kernel/vbus/Kconfig
index 3aaa085..71acd6f 100644
--- a/kernel/vbus/Kconfig
+++ b/kernel/vbus/Kconfig
@@ -6,6 +6,7 @@ config VBUS
bool Virtual Bus
select CONFIGFS_FS
select SHM_SIGNAL
+   select IOQ
default n
help
 Provides a mechansism for declaring virtual-bus objects and binding
@@ -15,6 +16,7 @@ config VBUS
 
 config VBUS_DRIVERS
tristate VBUS Driver support
+   select IOQ
default n
help
 Adds support for a virtual bus model for proxying drivers.
diff --git a/kernel/vbus/Makefile b/kernel/vbus/Makefile
index d028ece..45f6503 100644
--- a/kernel/vbus/Makefile
+++ b/kernel/vbus/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_VBUS) += core.o devclass.o config.o attribute.o map.o client.o
+obj-$(CONFIG_VBUS) += shm-ioq.o
 
 vbus-proxy-objs += proxy.o
 obj-$(CONFIG_VBUS_DRIVERS) += vbus-proxy.o
diff --git a/kernel/vbus/proxy.c b/kernel/vbus/proxy.c
index ea48f00..75b0cb1 100644
--- a/kernel/vbus/proxy.c
+++ b/kernel/vbus/proxy.c
@@ -150,3 +150,67 @@ void vbus_driver_unregister(struct vbus_driver *drv)
 }
 EXPORT_SYMBOL_GPL(vbus_driver_unregister);
 
+/*
+ *-
+ * driver-side IOQ helper
+ *-
+ */
+static void
+vbus_driver_ioq_release(struct ioq *ioq)
+{
+   kfree(ioq-head_desc);
+   kfree(ioq);
+}
+
+static struct ioq_ops vbus_driver_ioq_ops = {
+   .release = vbus_driver_ioq_release,
+};
+
+
+int vbus_driver_ioq_alloc(struct vbus_device_proxy *dev, int id, int prio,
+ size_t count, struct ioq **ioq)
+{
+   struct ioq   *_ioq;
+   struct ioq_ring_head *head = NULL;
+   struct shm_signal*signal = NULL;
+   size_tlen = IOQ_HEAD_DESC_SIZE(count);
+   int   ret = -ENOMEM;
+
+   _ioq = kzalloc(sizeof(*_ioq), GFP_KERNEL);
+   if (!_ioq)
+   goto error;
+
+   head = kzalloc(len, GFP_KERNEL | GFP_DMA);
+   if (!head)
+   goto error;
+
+   head-magic = IOQ_RING_MAGIC;
+   head-ver   = IOQ_RING_VER;
+   head-count = count;
+
+   ret = dev-ops-shm(dev, id, prio, head, len,
+   head-signal, signal, 0);
+   if (ret  0)
+   goto error;
+
+   ioq_init(_ioq,
+vbus_driver_ioq_ops,
+ioq_locality_north,
+head,
+signal,
+count);
+
+   *ioq = _ioq;
+
+   return 0;
+
+ error:
+   kfree(_ioq);
+   kfree(head);
+
+   if (signal)
+   shm_signal_put(signal);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(vbus_driver_ioq_alloc);
diff --git a/kernel/vbus/shm-ioq.c b/kernel/vbus/shm-ioq.c
new file mode 100644
index 000..a627337
--- /dev/null
+++ b/kernel/vbus/shm-ioq.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2009 Novell.  All Rights Reserved.
+ *
+ * IOQ helper for 

[RFC PATCH 09/17] net: Add vbus_enet driver

2009-03-31 Thread Gregory Haskins
Signed-off-by: Gregory Haskins ghask...@novell.com
---

 drivers/net/Kconfig |   13 +
 drivers/net/Makefile|1 
 drivers/net/vbus-enet.c |  706 +++
 3 files changed, 720 insertions(+), 0 deletions(-)
 create mode 100644 drivers/net/vbus-enet.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 62d732a..ac9dabd 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3099,4 +3099,17 @@ config VIRTIO_NET
  This is the virtual network driver for virtio.  It can be used with
   lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
 
+config VBUS_ENET
+   tristate Virtual Ethernet Driver
+   depends on VBUS_DRIVERS
+   help
+  A virtualized 802.x network device based on the VBUS interface.
+  It can be used with any hypervisor/kernel that supports the
+  vbus protocol.
+
+config VBUS_ENET_DEBUG
+bool Enable Debugging
+   depends on VBUS_ENET
+   default n
+
 endif # NETDEVICES
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 471baaf..61db928 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
 obj-$(CONFIG_NETXEN_NIC) += netxen/
 obj-$(CONFIG_NIU) += niu.o
 obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
+obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
 obj-$(CONFIG_SFC) += sfc/
 
 obj-$(CONFIG_WIMAX) += wimax/
diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
new file mode 100644
index 000..e698b3f
--- /dev/null
+++ b/drivers/net/vbus-enet.c
@@ -0,0 +1,706 @@
+/*
+ * vbus_enet - A virtualized 802.x network device based on the VBUS interface
+ *
+ * Copyright (C) 2009 Novell, Gregory Haskins ghask...@novell.com
+ *
+ * Derived from the SNULL example from the book Linux Device Drivers by
+ * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
+ * by O'Reilly  Associates.
+ */
+
+#include linux/module.h
+#include linux/init.h
+#include linux/moduleparam.h
+
+#include linux/sched.h
+#include linux/kernel.h
+#include linux/slab.h
+#include linux/errno.h
+#include linux/types.h
+#include linux/interrupt.h
+
+#include linux/in.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/ip.h
+#include linux/tcp.h
+#include linux/skbuff.h
+#include linux/ioq.h
+#include linux/vbus_driver.h
+
+#include linux/in6.h
+#include asm/checksum.h
+
+#include linux/venet.h
+
+MODULE_AUTHOR(Gregory Haskins);
+MODULE_LICENSE(GPL);
+
+static int napi_weight = 128;
+module_param(napi_weight, int, 0444);
+static int rx_ringlen = 256;
+module_param(rx_ringlen, int, 0444);
+static int tx_ringlen = 256;
+module_param(tx_ringlen, int, 0444);
+
+#undef PDEBUG /* undef it, just in case */
+#ifdef VBUS_ENET_DEBUG
+#  define PDEBUG(fmt, args...) printk(KERN_DEBUG vbus_enet:  fmt, ## args)
+#else
+#  define PDEBUG(fmt, args...) /* not debugging: nothing */
+#endif
+
+struct vbus_enet_queue {
+   struct ioq  *queue;
+   struct ioq_notifier  notifier;
+};
+
+struct vbus_enet_priv {
+   spinlock_t lock;
+   struct net_device *dev;
+   struct vbus_device_proxy  *vdev;
+   struct napi_struct napi;
+   struct net_device_statsstats;
+   struct vbus_enet_queue rxq;
+   struct vbus_enet_queue txq;
+   struct tasklet_struct  txtask;
+};
+
+static struct vbus_enet_priv *
+napi_to_priv(struct napi_struct *napi)
+{
+   return container_of(napi, struct vbus_enet_priv, napi);
+}
+
+static int
+queue_init(struct vbus_enet_priv *priv,
+  struct vbus_enet_queue *q,
+  int qid,
+  size_t ringsize,
+  void (*func)(struct ioq_notifier *))
+{
+   struct vbus_device_proxy *dev = priv-vdev;
+   int ret;
+
+   ret = vbus_driver_ioq_alloc(dev, qid, 0, ringsize, q-queue);
+   if (ret  0)
+   panic(ioq_alloc failed: %d\n, ret);
+
+   if (func) {
+   q-notifier.signal = func;
+   q-queue-notifier = q-notifier;
+   }
+
+   return 0;
+}
+
+static int
+devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
+{
+   struct vbus_device_proxy *dev = priv-vdev;
+
+   return dev-ops-call(dev, func, data, len, 0);
+}
+
+/*
+ * ---
+ * rx descriptors
+ * ---
+ */
+
+static void
+rxdesc_alloc(struct ioq_ring_desc *desc, size_t len)
+{
+   struct sk_buff *skb;
+
+   len += ETH_HLEN;
+
+   skb = dev_alloc_skb(len + 2);
+   BUG_ON(!skb);
+
+   skb_reserve(skb, 2); /* align IP on 16B boundary */
+
+   desc-cookie = (u64)skb;
+   desc-ptr= (u64)__pa(skb-data);
+   desc-len= len; /* total length  */
+   desc-valid  = 1;
+}
+
+static void
+rx_setup(struct vbus_enet_priv *priv)
+{
+   struct ioq *ioq = priv-rxq.queue;
+   struct ioq_iterator iter;
+   int ret;
+
+   /*
+* We want to iterate on the valid index.  By 

Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Gerry Reno wrote:

Tomasz Chmielewski wrote:

Gerry Reno schrieb:


What does:

md5sum /dev/sr0

output?



DVD is Fedora 10 DVD (i386)

Four cases:

# desktop user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# desktop user; DVD mounted
$ md5sum /dev/sr0
ff311b322c894aabc4361c4e270f5a3f  /dev/sr0


Download the iso file to your disk and point kvm there.

It's the easiest to do; your problem is not really kvm-specific.


I'll try that but it seems as though the process being owned by root 
is preventing the access to the cdrom.  So isn't that kvm?  Does 
libvirt know this?  I mean never once have we been able to use the 
cdrom from the VM.  Not just for this problem.  This seems to be some 
kind of access problem.



Ok, using an F10 ISO file we get an access denied error when trying to 
define the domain.  So I did this:  chcon -t virt_image_t F10-ISO-FILE 
and now the domain defines ok.  BUT, even then using the ISO file we 
still get the same error message as always:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.


The host runs with SELinux targeted enforce.  So thinking this might be 
an selinux issue I set 'setenforce 0' to put it into permissive mode but 
it made no difference.  Still the same boot error.

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.

Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Gerry Reno wrote:

Gerry Reno wrote:

Tomasz Chmielewski wrote:

Gerry Reno schrieb:


What does:

md5sum /dev/sr0

output?



DVD is Fedora 10 DVD (i386)

Four cases:

# desktop user; DVD unmounted
$ md5sum /dev/sr0
md5sum: /dev/sr0: Input/output error

# desktop user; DVD mounted
$ md5sum /dev/sr0
ff311b322c894aabc4361c4e270f5a3f  /dev/sr0


Download the iso file to your disk and point kvm there.

It's the easiest to do; your problem is not really kvm-specific.


I'll try that but it seems as though the process being owned by root 
is preventing the access to the cdrom.  So isn't that kvm?  Does 
libvirt know this?  I mean never once have we been able to use the 
cdrom from the VM.  Not just for this problem.  This seems to be some 
kind of access problem.



Ok, using an F10 ISO file we get an access denied error when trying to 
define the domain.  So I did this:  chcon -t virt_image_t F10-ISO-FILE 
and now the domain defines ok.  BUT, even then using the ISO file we 
still get the same error message as always:

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.


The host runs with SELinux targeted enforce.  So thinking this might 
be an selinux issue I set 'setenforce 0' to put it into permissive 
mode but it made no difference.  Still the same boot error.

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.

We even tried setting the 'hvm' boot device to be 'cdrom' and it 
automatically tries to boot from the cdrom but it still gets the same 
0003 error as before.


Is there no way to recover this VM?  Seems like we fell into a trap 
with no escape.


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Javier Guerra
On Tue, Mar 31, 2009 at 1:53 PM, Gerry Reno gr...@verizon.net wrote:
 Boot Failure Code:  0003
 Boot from CDROM failed:  cannot read the boot disk.
 FATAL: No bootable device.

your underlying problem is that you can't get libvirt to generate the
appropriate command line.  you really should take it to the libvirt
list

-- 
Javier
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Gerry Reno

Javier Guerra wrote:

On Tue, Mar 31, 2009 at 1:53 PM, Gerry Reno gr...@verizon.net wrote:
  

Boot Failure Code:  0003
Boot from CDROM failed:  cannot read the boot disk.
FATAL: No bootable device.



your underlying problem is that you can't get libvirt to generate the
appropriate command line.  you really should take it to the libvirt
list

  
Ok, can you give me a command line that will work and then I'll take 
that over to libvirt.


Regards,
Gerry

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 15/17] kvm: add dynamic IRQ support

2009-03-31 Thread Avi Kivity

Gregory Haskins wrote:

This patch provides the ability to dynamically declare and map an
interrupt-request handle to an x86 8-bit vector.

Problem Statement: Emulated devices (such as PCI, ISA, etc) have
interrupt routing done via standard PC mechanisms (MP-table, ACPI,
etc).  However, we also want to support a new class of devices
which exist in a new virtualized namespace and therefore should
not try to piggyback on these emulated mechanisms.  Rather, we
create a way to dynamically register interrupt resources that
acts independently of the emulated counterpart.

On x86, a simplistic view of the interrupt model is that each core
has a local-APIC which can receive messages from APIC-compliant
routing devices (such as IO-APIC and MSI) regarding details about
an interrupt (such as which vector to raise).  These routing devices
are controlled by the OS so they may translate a physical event
(such as e1000: raise an RX interrupt) to a logical destination
(such as inject IDT vector 46 on core 3).  A dynirq is a virtual
implementation of such a router (think of it as a virtual-MSI, but
without the coupling to an existing standard, such as PCI).

The model is simple: A guest OS can allocate the mapping of IRQ
handle to vector/core in any way it sees fit, and provide this
information to the dynirq module running in the host.  The assigned
IRQ then becomes the sole handle needed to inject an IDT vector
to the guest from a host.  A host entity that wishes to raise an
interrupt simply needs to call kvm_inject_dynirq(irq) and the routing
is performed transparently.
  


A major disadvantage of dynirq is that it will only work on guests which 
have been ported to it.  So this will only be useful on newer Linux, and 
will likely never work with Windows guests.


Why is having an emulated PCI device so bad?  We found that it has 
several advantages:

- works with all guests
- supports hotplug/hotunplug, udev, sysfs, module autoloading, ...
- supported in all OSes
- someone else maintains it

See also the kvm irq routing work, merged into 2.6.30, which does a 
small part of what you're describing (the sole handle part, specifically).


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: KVM-74: HELP PLEASE - cannot boot from cdrom for recovery

2009-03-31 Thread Javier Guerra
On Tue, Mar 31, 2009 at 2:13 PM, Gerry Reno gr...@verizon.net wrote:
 Javier Guerra wrote:

 your underlying problem is that you can't get libvirt to generate the
 appropriate command line.  you really should take it to the libvirt
 list



 Ok, can you give me a command line that will work and then I'll take that
 over to libvirt.

try this:

/usr/bin/qemu-kvm -S -M pc -m 512 -smp 2 -name MX_3 -monitor pty -boot
d -drive file=/var/vm/vm1/qemu/images/MX_3/MX_3.img,if=ide,index=0,boot=on
-cdrom /dev/sr0 -net nic,macaddr=00:0c:29:e3:bc:ee,vlan=0 -net
tap,fd=17,script=,vlan=0,ifname=vnet1 -serial none -parallel none -usb
-vnc 127.0.0.1:1 -k en-us




-- 
Javier
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 15/17] kvm: add dynamic IRQ support

2009-03-31 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 This patch provides the ability to dynamically declare and map an
 interrupt-request handle to an x86 8-bit vector.

 Problem Statement: Emulated devices (such as PCI, ISA, etc) have
 interrupt routing done via standard PC mechanisms (MP-table, ACPI,
 etc).  However, we also want to support a new class of devices
 which exist in a new virtualized namespace and therefore should
 not try to piggyback on these emulated mechanisms.  Rather, we
 create a way to dynamically register interrupt resources that
 acts independently of the emulated counterpart.

 On x86, a simplistic view of the interrupt model is that each core
 has a local-APIC which can receive messages from APIC-compliant
 routing devices (such as IO-APIC and MSI) regarding details about
 an interrupt (such as which vector to raise).  These routing devices
 are controlled by the OS so they may translate a physical event
 (such as e1000: raise an RX interrupt) to a logical destination
 (such as inject IDT vector 46 on core 3).  A dynirq is a virtual
 implementation of such a router (think of it as a virtual-MSI, but
 without the coupling to an existing standard, such as PCI).

 The model is simple: A guest OS can allocate the mapping of IRQ
 handle to vector/core in any way it sees fit, and provide this
 information to the dynirq module running in the host.  The assigned
 IRQ then becomes the sole handle needed to inject an IDT vector
 to the guest from a host.  A host entity that wishes to raise an
 interrupt simply needs to call kvm_inject_dynirq(irq) and the routing
 is performed transparently.
   

 A major disadvantage of dynirq is that it will only work on guests
 which have been ported to it.  So this will only be useful on newer
 Linux, and will likely never work with Windows guests.

 Why is having an emulated PCI device so bad?  We found that it has
 several advantages:
 - works with all guests
 - supports hotplug/hotunplug, udev, sysfs, module autoloading, ...
 - supported in all OSes
 - someone else maintains it
These points are all valid, and I really struggled with this particular
part of the design.  The entire vbus design only requires one IRQ for
the entire guest, so it's conceivable that I could present a simple
dummy PCI device with some VBUS type PCI-ID, just to piggy back on
the IRQ routing logic.  Then userspace could simply pass the IRQ routing
info down to the kernel with an ioctl, or something similar.

Ultimately I wasn't sure whether I wanted all that goo just to get an
IRQ assignment...but on the other hand, we have all this goo to build
one in the first place, and its half on the guest side which has the
disadvantages you mention.  So perhaps this should go in favor of a
PCI-esque type solution, as I think you are suggesting.

I think ultimately I was trying to stay away from PCI in general because
I want to support environments that do not have PCI.  However, for the
kvm-transport case (at least on x86) this isn't really a constraint.


 See also the kvm irq routing work, merged into 2.6.30, which does a
 small part of what you're describing (the sole handle part,
 specifically).

I will take a look, thanks!

(I wish you had accepted those irq patches I wrote a while back. 
It had the foundation for this type of stuff all built in.  But alas, I
think it was before its time, and I didn't do a good job of explaining
my future plans) ;)

Regards,
-Greg






signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 13/17] x86: allow the irq-vector translation to be determined outside of ioapic

2009-03-31 Thread Alan Cox
On Tue, 31 Mar 2009 14:43:55 -0400
Gregory Haskins ghask...@novell.com wrote:

 The ioapic code currently privately manages the mapping between irq
 and vector.  This results in some layering violations as the support
 for certain MSI operations need this info.  As a result, the MSI
 code itself was moved to the ioapic module.  This is not really
 optimal.

This appears to have been muddled in with the vnet patches ?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 13/17] x86: allow the irq-vector translation to be determined outside of ioapic

2009-03-31 Thread Gregory Haskins
Alan Cox wrote:
 On Tue, 31 Mar 2009 14:43:55 -0400
 Gregory Haskins ghask...@novell.com wrote:

   
 The ioapic code currently privately manages the mapping between irq
 and vector.  This results in some layering violations as the support
 for certain MSI operations need this info.  As a result, the MSI
 code itself was moved to the ioapic module.  This is not really
 optimal.
 

 This appears to have been muddled in with the vnet patches ?
   
It's needed for the kvm-connector patches later in the series, so it was
included intentionally.

On that topic, I probably should have had a TOC of some kind.  Hmm..let
me hack one together now:

Patch 1: Stand-alone shared-memory signal construct, used by various
components in vbus/venet
Patches 2-5: Basic vbus infrastructure
Patches 6-7: IOQ construct, similar to virtio-ring.  Used to overlay
ring-like behavior over the shm interface in vbus
Patches 8-12: virtual-ethernet front and backends
Patch 13: io-apic work to expose the irq-vector in x86, needed for
dynirq support
Patches 14-16: KVM host side support
Patch 17: KVM guest side support

Sorry for the confusion :(

Regards,
-Greg



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 14/17] kvm: add a reset capability

2009-03-31 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 We need a way to detect if a VM is reset later in the series, so lets
 add a capability for userspace to signal a VM reset down to the kernel.
   

 How do you handle the case of a guest calling kexec to load a new
 kernel?  Or is that not important for your use case?


Hmm..I had not considered this.  Any suggestions on ways to detect it?



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 15/17] kvm: add dynamic IRQ support

2009-03-31 Thread Avi Kivity

Gregory Haskins wrote:

- works with all guests
- supports hotplug/hotunplug, udev, sysfs, module autoloading, ...
- supported in all OSes
- someone else maintains it


These points are all valid, and I really struggled with this particular
part of the design.  The entire vbus design only requires one IRQ for
the entire guest,


Won't this have scaling issues?  One IRQ means one target vcpu.  Whereas 
I'd like virtio devices to span multiple queues, each queue with its own 
MSI IRQ.  Also, the single IRQ handler will need to scan for all 
potential IRQ sources.  Even if implemented carefully, this will cause 
many cacheline bounces.



 so its conceivable that I could present a simple
dummy PCI device with some VBUS type PCI-ID, just to piggy back on
the IRQ routing logic.  Then userspace could simply pass the IRQ routing
info down to the kernel with an ioctl, or something similar.
  


Xen does something similar, I believe.


I think ultimately I was trying to stay away from PCI in general because
I want to support environments that do not have PCI.  However, for the
kvm-transport case (at least on x86) this isnt really a constraint.

  


s/PCI/the native IRQ solution for your platform/. virtio has the same 
problem; on s390 we use the native (if that word ever applies to s390) 
interrupt and device discovery mechanism.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 14/17] kvm: add a reset capability

2009-03-31 Thread Avi Kivity

Gregory Haskins wrote:

Avi Kivity wrote:
  

Gregory Haskins wrote:


We need a way to detect if a VM is reset later in the series, so lets
add a capability for userspace to signal a VM reset down to the kernel.
  
  

How do you handle the case of a guest calling kexec to load a new
kernel?  Or is that not important for your use case?




Hmm..I had not considered this.  Any suggestions on ways to detect it?

  


Best would be not to detect it; it's tying global events into a device.  
Instead, have a reset command for your device and have the driver issue 
it on load and unload.


btw, reset itself would be better controlled from userspace; qemu knows 
about resets and can reset vbus devices directly instead of relying on 
kvm to reset them.  This decouples the two code bases a bit.  This is 
what virtio does.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 00/17] virtual-bus

2009-03-31 Thread Andi Kleen
Gregory Haskins ghask...@novell.com writes:

What might be useful is if you could expand a bit more on what the high-level
use cases for this are. 

Questions that come to mind and that would be good to answer:

This seems to be aimed at having multiple VMs talk
to each other, but not talk to the rest of the world, correct? 
Is that a common use case? 

Wouldn't they typically have a default route anyways and be able to talk to
each other this way? 
And why can't any such isolation be done with standard firewalling? (it's
known that current iptables has some scalability issues, but there's work
going on right now to fix that). 

What would be the use cases for non networking devices?

What would the interfaces to the user look like?

-Andi

-- 
a...@linux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with live migration using kvm-84

2009-03-31 Thread Gerrit Slomma
I updated my bugreport at https://bugzilla.redhat.com/show_bug.cgi?id=492688#c8
When migrating with -no-kvm everything works fine without stacktraces.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 15/17] kvm: add dynamic IRQ support

2009-03-31 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 - works with all guests
 - supports hotplug/hotunplug, udev, sysfs, module autoloading, ...
 - supported in all OSes
 - someone else maintains it
 
 These points are all valid, and I really struggled with this particular
 part of the design.  The entire vbus design only requires one IRQ for
 the entire guest,

 Won't this have scaling issues?  One IRQ means one target vcpu. 
 Whereas I'd like virtio devices to span multiple queues, each queue
 with its own MSI IRQ.
Hmm..you know I hadn't really thought of it that way, but you have a
point.  To clarify, my design actually uses one IRQ per eventq, where
we can have an arbitrary number of eventq's defined (note: today I only
define one eventq, however).  An eventq is actually a shm-ring construct
where I can pass events up to the host like "device added" or "ring X
signaled".  Each individual device-based virtio-ring would then
aggregate signal events onto this eventq mechanism to actually inject
events to the host.  Only the eventq itself injects an actual IRQ to the
assigned vcpu.

My intended use of multiple eventqs was for prioritization of different
rings.  For instance, we could define 8 priority levels, each with its
own ring/irq.  That way, a virtio-net that supports something like
802.1p could define 8 virtio-rings, one for each priority level.

But this scheme is more targeted at prioritization than per vcpu
irq-balancing.  I suppose the eventq construct I proposed could still be
used in this fashion since each has its own routable IRQ.  However, I
would have to think about that some more because it is beyond the design
spec.

The good news is that the decision to use the eventq+irq approach is
completely contained in the kvm-host+guest.patch.  We could easily
switch to a 1:1 irq:shm-signal if we wanted to, and the device/drivers
would work exactly the same without modification.

   Also, the single IRQ handler will need to scan for all potential IRQ
 sources.  Even if implemented carefully, this will cause many
 cacheline bounces.
Well, no, I think this part is covered.  As mentioned above, we use a
queuing technique so there is no scanning needed.  Ultimately I would
love to adapt a similar technique to optionally replace the LAPIC.  That
way we can avoid the EOI trap and just consume the next interrupt (if
applicable) from the shm-ring.


  so its conceivable that I could present a simple
 dummy PCI device with some VBUS type PCI-ID, just to piggy back on
 the IRQ routing logic.  Then userspace could simply pass the IRQ routing
 info down to the kernel with an ioctl, or something similar.
   

 Xen does something similar, I believe.

 I think ultimately I was trying to stay away from PCI in general because
 I want to support environments that do not have PCI.  However, for the
 kvm-transport case (at least on x86) this isnt really a constraint.

   

 s/PCI/the native IRQ solution for your platform/. virtio has the same
 problem; on s390 we use the native (if that word ever applies to s390)
 interrupt and device discovery mechanism.

yeah, I agree.  We can contain the exposure of PCI to just platforms
within KVM that care about it.

-Greg




signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 14/17] kvm: add a reset capability

2009-03-31 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 Avi Kivity wrote:
  
 Gregory Haskins wrote:

 We need a way to detect if a VM is reset later in the series, so lets
 add a capability for userspace to signal a VM reset down to the
 kernel.
 
 How do you handle the case of a guest calling kexec to load a new
 kernel?  Or is that not important for your use case?

 

 Hmm..I had not considered this.  Any suggestions on ways to detect it?

   

 Best would be not to detect it; it's tying global events into a
 device.  Instead, have a reset command for your device and have the
 driver issue it on load and unload.

Yes, good point.  This is doable within the existing infrastructure, but
it would have to be declared in each device's ABI definition.  I could
make it more formal and add it to the list of low-level bus-verbs, like
DEVICEOPEN, DEVICECLOSE, etc.


 btw, reset itself would be better controlled from userspace; qemu
 knows about resets and can reset vbus devices directly instead of
 relying on kvm to reset them.
In a way, this is what I have done (note to self: post the userspace
patches)

The detection is done by userspace, and it invokes an ioctl.  The kernel
based devices then react if they are interested.  In my case, vbus
registers for reset-notification, and it acts as if the guest exited
when it gets reset (e.g. it issues DEVICECLOSE verbs to all devices the
guest had open).




signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 09/17] net: Add vbus_enet driver

2009-03-31 Thread Stephen Hemminger
On Tue, 31 Mar 2009 14:43:34 -0400
Gregory Haskins ghask...@novell.com wrote:

 Signed-off-by: Gregory Haskins ghask...@novell.com
 ---
 
  drivers/net/Kconfig |   13 +
  drivers/net/Makefile|1 
  drivers/net/vbus-enet.c |  706 
 +++
  3 files changed, 720 insertions(+), 0 deletions(-)
  create mode 100644 drivers/net/vbus-enet.c
 
 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
 index 62d732a..ac9dabd 100644
 --- a/drivers/net/Kconfig
 +++ b/drivers/net/Kconfig
 @@ -3099,4 +3099,17 @@ config VIRTIO_NET
 This is the virtual network driver for virtio.  It can be used with
lguest or QEMU based VMMs (like KVM or Xen).  Say Y or M.
  
 +config VBUS_ENET
 + tristate Virtual Ethernet Driver
 + depends on VBUS_DRIVERS
 + help
 +A virtualized 802.x network device based on the VBUS interface.
 +It can be used with any hypervisor/kernel that supports the
 +vbus protocol.
 +
 +config VBUS_ENET_DEBUG
 +bool Enable Debugging
 + depends on VBUS_ENET
 + default n
 +
  endif # NETDEVICES
 diff --git a/drivers/net/Makefile b/drivers/net/Makefile
 index 471baaf..61db928 100644
 --- a/drivers/net/Makefile
 +++ b/drivers/net/Makefile
 @@ -264,6 +264,7 @@ obj-$(CONFIG_FS_ENET) += fs_enet/
  obj-$(CONFIG_NETXEN_NIC) += netxen/
  obj-$(CONFIG_NIU) += niu.o
  obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 +obj-$(CONFIG_VBUS_ENET) += vbus-enet.o
  obj-$(CONFIG_SFC) += sfc/
  
  obj-$(CONFIG_WIMAX) += wimax/
 diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
 new file mode 100644
 index 000..e698b3f
 --- /dev/null
 +++ b/drivers/net/vbus-enet.c
 @@ -0,0 +1,706 @@
 +/*
 + * vbus_enet - A virtualized 802.x network device based on the VBUS interface
 + *
 + * Copyright (C) 2009 Novell, Gregory Haskins ghask...@novell.com
 + *
 + * Derived from the SNULL example from the book Linux Device Drivers by
 + * Alessandro Rubini, Jonathan Corbet, and Greg Kroah-Hartman, published
 + * by O'Reilly  Associates.
 + */
 +
 +#include linux/module.h
 +#include linux/init.h
 +#include linux/moduleparam.h
 +
 +#include linux/sched.h
 +#include linux/kernel.h
 +#include linux/slab.h
 +#include linux/errno.h
 +#include linux/types.h
 +#include linux/interrupt.h
 +
 +#include linux/in.h
 +#include linux/netdevice.h
 +#include linux/etherdevice.h
 +#include linux/ip.h
 +#include linux/tcp.h
 +#include linux/skbuff.h
 +#include linux/ioq.h
 +#include linux/vbus_driver.h
 +
 +#include linux/in6.h
 +#include asm/checksum.h
 +
 +#include linux/venet.h
 +
 +MODULE_AUTHOR(Gregory Haskins);
 +MODULE_LICENSE(GPL);
 +
 +static int napi_weight = 128;
 +module_param(napi_weight, int, 0444);
 +static int rx_ringlen = 256;
 +module_param(rx_ringlen, int, 0444);
 +static int tx_ringlen = 256;
 +module_param(tx_ringlen, int, 0444);
 +
 +#undef PDEBUG /* undef it, just in case */
 +#ifdef VBUS_ENET_DEBUG
 +#  define PDEBUG(fmt, args...) printk(KERN_DEBUG vbus_enet:  fmt, ## args)
 +#else
 +#  define PDEBUG(fmt, args...) /* not debugging: nothing */
 +#endif
 +
 +struct vbus_enet_queue {
 + struct ioq  *queue;
 + struct ioq_notifier  notifier;
 +};
 +
 +struct vbus_enet_priv {
 + spinlock_t lock;
 + struct net_device *dev;
 + struct vbus_device_proxy  *vdev;
 + struct napi_struct napi;
 + struct net_device_statsstats;

Not needed any more, stats are available in net_device

 + struct vbus_enet_queue rxq;
 + struct vbus_enet_queue txq;
 + struct tasklet_struct  txtask;
 +};
 +

 + * Ioctl commands
 + */
 +static int
 +vbus_enet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 +{
 + PDEBUG(ioctl\n);
 + return 0;
 +}

If it doesn't do ioctl, just leave pointer as NULL

 +/*
 + * Return statistics to the caller
 + */
 +static struct net_device_stats *
 +vbus_enet_stats(struct net_device *dev)
 +{
 + struct vbus_enet_priv *priv = netdev_priv(dev);
 + return priv-stats;
 +}

Not needed if you use internal net_device stats

 +static void
 +rx_isr(struct ioq_notifier *notifier)
 +{
 + struct vbus_enet_priv *priv;
 + struct net_device  *dev;
 +
 + priv = container_of(notifier, struct vbus_enet_priv, rxq.notifier);
 + dev = priv-dev;
 +
 + if (!ioq_empty(priv-rxq.queue, ioq_idxtype_inuse))
 + vbus_enet_schedule_rx(priv);
 +}
 +
 +static void
 +deferred_tx_isr(unsigned long data)
 +{
 + struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
 + unsigned long flags;
 +
 + PDEBUG(deferred_tx_isr for %lld\n, priv-vdev-id);
 +
 + spin_lock_irqsave(priv-lock, flags);
 + vbus_enet_tx_reap(priv, 0);
 + spin_unlock_irqrestore(priv-lock, flags);
 +
 + ioq_notify_enable(priv-txq.queue, 0);
 +}
 +
 +static void
 +tx_isr(struct ioq_notifier *notifier)
 +{
 +   struct vbus_enet_priv *priv;
 +   unsigned long flags;
 +
 +   priv = 

Re: [RFC PATCH 01/17] shm-signal: shared-memory signals

2009-03-31 Thread Avi Kivity

Gregory Haskins wrote:

This interface provides a bidirectional shared-memory based signaling
mechanism.  It can be used by any entities which desire efficient
communication via shared memory.  The implementation details of the
signaling are abstracted so that they may transcend a wide variety
of locale boundaries (e.g. userspace/kernel, guest/host, etc).

The shm_signal mechanism supports event masking as well as spurious
event delivery mitigation.
+
+/*
+ *-
+ * The following structures represent data that is shared across boundaries
+ * which may be quite disparate from one another (e.g. Windows vs Linux,
+ * 32 vs 64 bit, etc).  Therefore, care has been taken to make sure they
+ * present data in a manner that is independent of the environment.
+ *---
+ */
+
+#define SHM_SIGNAL_MAGIC 0x58fa39df
+#define SHM_SIGNAL_VER   1
+
+struct shm_signal_irq {
+   __u8  enabled;
+   __u8  pending;
+   __u8  dirty;
+};
  


Some ABIs may choose to pad this, suggest explicit padding.


+
+enum shm_signal_locality {
+   shm_locality_north,
+   shm_locality_south,
+};
+
+struct shm_signal_desc {
+   __u32 magic;
+   __u32 ver;
+   struct shm_signal_irq irq[2];
+};
  


Similarly, this should be padded to 0 (mod 8).

Instead of versions, I prefer feature flags which can be independently 
enabled or disabled.



+
+/* --- END SHARED STRUCTURES --- */
+
+#ifdef __KERNEL__
+
+#include linux/interrupt.h
+
+struct shm_signal_notifier {
+   void (*signal)(struct shm_signal_notifier *);
+};
  


This means -inject() has been called from the other side?

(reading below I see this is so.  not used to reading well commented 
code... :)



+
+struct shm_signal;
+
+struct shm_signal_ops {
+   int  (*inject)(struct shm_signal *s);
+   void (*fault)(struct shm_signal *s, const char *fmt, ...);
  


Eww.  Must we involve strings and printf formats?


+   void (*release)(struct shm_signal *s);
+};
+
+/*
+ * signaling protocol:
+ *
+ * each side of the shm_signal has an irq structure with the following
+ * fields:
+ *
+ *- enabled: controlled by shm_signal_enable/disable() to mask/unmask
+ *   the notification locally
+ *- dirty:   indicates if the shared-memory is dirty or clean.  This
+ *   is updated regardless of the enabled/pending state so that
+ *   the state is always accurately tracked.
+ *- pending: indicates if a signal is pending to the remote locale.
+ *   This allows us to determine if a remote-notification is
+ *   already in flight to optimize spurious notifications away.
+ */
  


When you overlay a ring on top of this, won't the ring indexes convey 
the same information as -dirty?



--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with live migration using kvm-84

2009-03-31 Thread Gerrit Slomma
Gerrit Slomma roadrunner_gs at web.de writes:

 
 I updated my bugreport at 
 https://bugzilla.redhat.com/show_bug.cgi?id=492688#c8
 When migrating with -no-kvm everything works fine without stacktraces.

Seems like -no-kvm is emulating an AMD on my Intel-Hosts...

host

rr016# grep vendor_id /proc/cpuinfo
vendor_id   : GenuineIntel
vendor_id   : GenuineIntel

virtual machine

rr019v4#  grep vendor_id /proc/cpuinfo
vendor_id   : AuthenticAMD
vendor_id   : AuthenticAMD

host

rr017# grep vendor_id /proc/cpuinfo
vendor_id   : GenuineIntel
vendor_id   : GenuineIntel

virtual machine

rr019v4#  grep vendor_id /proc/cpuinfo
vendor_id   : AuthenticAMD
vendor_id   : AuthenticAMD

whereas without -no-kvm it is Intel

virtual machine

rr019v4# grep vendor_id /proc/cpuinfo
vendor_id   : GenuineIntel
vendor_id   : GenuineIntel

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >