Re: [PATCH 2/4] KVM: Add accessor for reading cr4 (or some bits of cr4)
On Monday 07 December 2009 18:47:10 Avi Kivity wrote: > Some bits of cr4 can be owned by the guest on vmx, so when we read them, > we copy them to the vcpu structure. In preparation for making the set of > guest-owned bits dynamic, use helpers to access these bits so we don't need > to know where the bit resides. > > No changes to svm since all bits are host-owned there. > > Signed-off-by: Avi Kivity > --- > arch/x86/include/asm/kvm_host.h |1 + > arch/x86/kvm/kvm_cache_regs.h | 12 > arch/x86/kvm/mmu.h |5 +++-- > arch/x86/kvm/vmx.c | 13 - > arch/x86/kvm/x86.c | 16 ++-- > 5 files changed, 30 insertions(+), 17 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h > b/arch/x86/include/asm/kvm_host.h index da6dee8..e9f4f12 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -272,6 +272,7 @@ struct kvm_vcpu_arch { > unsigned long cr2; > unsigned long cr3; > unsigned long cr4; > + unsigned long cr4_guest_owned_bits; > unsigned long cr8; > u32 hflags; > u64 pdptrs[4]; /* pae */ > diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h > index 7bcc5b6..35acc36 100644 > --- a/arch/x86/kvm/kvm_cache_regs.h > +++ b/arch/x86/kvm/kvm_cache_regs.h > @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, > int index) return vcpu->arch.pdptrs[index]; > } > > +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) > +{ > + if (mask & vcpu->arch.cr4_guest_owned_bits) > + kvm_x86_ops->decache_cr4_guest_bits(vcpu); > + return vcpu->arch.cr4 & mask; > +} > + > +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) > +{ > + return kvm_read_cr4_bits(vcpu, ~0UL); > +} > + > #endif > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 61a1b38..4567d80 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -2,6 +2,7 @@ > #define __KVM_X86_MMU_H > > #include > +#include "kvm_cache_regs.h" > > #define PT64_PT_BITS 9 > #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) > @@ 
-64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) > > static inline int is_pae(struct kvm_vcpu *vcpu) > { > - return vcpu->arch.cr4 & X86_CR4_PAE; > + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); > } > > static inline int is_pse(struct kvm_vcpu *vcpu) > { > - return vcpu->arch.cr4 & X86_CR4_PSE; > + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); > } > > static inline int is_paging(struct kvm_vcpu *vcpu) > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 5ef820e..ae95a0c 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -1612,8 +1612,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) > > static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) > { > - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; > - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; > + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; > + > + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; > + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; > } > > static void ept_load_pdptrs(struct kvm_vcpu *vcpu) > @@ -1658,7 +1660,7 @@ static void ept_update_paging_mode_cr0(unsigned long > *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | > CPU_BASED_CR3_STORE_EXITING)); > vcpu->arch.cr0 = cr0; > - vmx_set_cr4(vcpu, vcpu->arch.cr4); > + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); > } else if (!is_paging(vcpu)) { > /* From nonpaging to paging */ > vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, > @@ -1666,7 +1668,7 @@ static void ept_update_paging_mode_cr0(unsigned long > *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | > CPU_BASED_CR3_STORE_EXITING)); > vcpu->arch.cr0 = cr0; > - vmx_set_cr4(vcpu, vcpu->arch.cr4); > + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); > } Another place accessed cr4 directly, in ept_update_paging_mode_cr4() >} else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) >*hw_cr4 &= ~X86_CR4_PAE; Others looks fine to me. 
-- regards Yang, Sheng -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Autotest] [PATCH] Add a server-side test - kvm_migration
On Mon, Dec 07, 2009 at 03:35:54PM +0530, sudhir kumar wrote: > Resending with proper cc list :( > > On Mon, Dec 7, 2009 at 2:43 PM, sudhir kumar wrote: > > Thanks for initiating the server side implementation of migration. Few > > comments below > > > > On Fri, Dec 4, 2009 at 1:48 PM, Yolkfull Chow wrote: > >> This patch will add a server-side test namely kvm_migration. Currently, > >> it will use existing KVM client test framework and add a new file > >> kvm_migration.py to help judge executing routine: source machine or dest > >> machine. > >> > >> * One thing need to be considered/improved: > >> Whether we parse the kvm_tests.cfg on server machine or on client machines? > >> If parse it on client machines, we need to fix one problem that adding > >> 'start_vm_for_migration' parameter into dict which generated on dest > >> machine. > > I think we can not manage with client side parsing without adding too > > much complexity. So let us continue parsing on the server side only > > for remote migration. Also as the patch does, keep the local migration > > under the client also. I do not like adding test variants in > > migration_control.srv. Comments below... > >> > >> So far I choose parsing kvm_tests.cfg on server machine, and then add > >> 'start_vm_for_migration' into dict cloned from original test dict for dest > >> machine. > >> > >> * In order to run this test so far, we need to setup NFS for both > >> source and dest machines. 
> >> > >> Signed-off-by: Yolkfull Chow > >> --- > >> client/tests/kvm/kvm_migration.py | 165 > >> > >> client/tests/kvm/kvm_test_utils.py | 27 +++--- > >> client/tests/kvm/kvm_tests.cfg.sample | 2 + > >> client/tests/kvm_migration | 1 + > >> server/tests/kvm/migration_control.srv | 137 ++ > >> 5 files changed, 320 insertions(+), 12 deletions(-) > >> create mode 100644 client/tests/kvm/kvm_migration.py > >> create mode 12 client/tests/kvm_migration > >> create mode 100644 server/tests/kvm/migration_control.srv > >> > >> diff --git a/client/tests/kvm/kvm_migration.py > >> b/client/tests/kvm/kvm_migration.py > >> new file mode 100644 > >> index 000..52cd3cd > >> --- /dev/null > >> +++ b/client/tests/kvm/kvm_migration.py > >> @@ -0,0 +1,165 @@ > >> +import sys, os, time, logging, commands, socket > >> +from autotest_lib.client.bin import test > >> +from autotest_lib.client.common_lib import error > >> +import kvm_utils, kvm_preprocessing, common, kvm_vm, kvm_test_utils > >> + > >> + > >> +class kvm_migration(test.test): > >> + """ > >> + KVM migration test. > >> + > >> + �...@copyright: Red Hat 2008-2009 > >> + �...@see: http://www.linux-kvm.org/page/KVM-Autotest/Client_Install > >> + (Online doc - Getting started with KVM testing) > >> + > >> + Migration execution progress: > >> + > >> + source host dest host > >> + -- > >> + log into guest > >> + -- > >> + start socket server > >> + > >> + wait 30 secs -- wait login_timeout+30 secs--- > >> + > >> + accept connection connect to socket server,send mig_port > >> + -- > >> + start migration > >> + > >> + wait 30 secs -- wait mig_timeout+30 secs- > >> + > >> + try to log into migrated guest > >> + -- > >> + > >> + """ > >> + version = 1 > >> + def initialize(self): > >> + pass > >> + > >> + > >> + def run_once(self, params): > >> + """ > >> + Setup remote machine and then execute migration. 
> >> + """ > >> + # Check whether remote machine is ready > >> + dsthost = params.get("dsthost") > >> + srchost = params.get("srchost") > >> + image_path = os.path.join(self.bindir, "images") > >> + > >> + rootdir = params.get("rootdir") > >> + iso = os.path.join(rootdir, 'iso') > >> + images = os.path.join(rootdir, 'images') > >> + qemu = os.path.join(rootdir, 'qemu') > >> + qemu_img = os.path.join(rootdir, 'qemu-img') > >> + > >> + def link_if_not_exist(ldir, target, link_name): > >> + t = target > >> + l = os.path.join(ldir, link_name) > >> + if not os.path.exists(l): > >> + os.symlink(t,l) > >> + link_if_not_exist(self.bindir, '../../', 'autotest') > >> + link_if_not_exist(self.bindir, iso, 'isos') > >> + link_if_not_exist(self.bindir, images, 'images') > >> + link_if_not_exist(self.bindir, qemu, 'qemu') > >> + link_if_not_exist(self.bindir, qemu_img, 'qemu-img') > >> + > >> + # Report the parameters we've received and
Re: [PATCH] virtio spec: add virtio-blk max sectors feature
On Thu, 3 Dec 2009 08:28:38 pm Avi Kivity wrote: > On 12/03/2009 10:42 AM, Avishay Traeger1 wrote: > > I previously submitted a patch to have the guest virtio-blk driver get the > > value for the maximum I/O size from the host bdrv, rather than assume that > > there is no limit. Avi requested that I first patch the virtio spec > > (http://ozlabs.org/~rusty/virtio-spec/). Below is that patch. > > > > Please CC me on replies, as I am not subscribed. > > > > > > Copying Rusty and virtualizat...@. Thanks Avi... Avishay; this would be the total sectors in an I/O, as separate from SIZE_MAX (maximum size of any single scatterlist entry) and SEG_MAX (maximum number of scatterlist entries)? Seems like a reasonable idea; esp if you need it. Thanks! Rusty. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[AUTOTEST PATCH] KVM test: subtest block_hotplug: Fixup pci_test_cmd in config file
RHEL-4.8 is still using 'hd[a-z]' as harddisk device name. This patch adds 'h' to regular expression in command `pci_test_cmd'. Signed-off-by: Yolkfull Chow --- client/tests/kvm/kvm_tests.cfg.sample |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index 20ae332..73c593a 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -217,7 +217,7 @@ variants: image_size_stg = 1G remove_image_stg = yes force_create_image_stg = yes -pci_test_cmd = "yes | mke2fs `fdisk -l 2>&1 | awk '/\/dev\/[sv]d[a-z] doesn/ {print $2}'`" +pci_test_cmd = "yes | mke2fs `fdisk -l 2>&1 | awk '/\/dev\/[hsv]d[a-z] doesn/ {print $2}'`" wait_secs_for_hook_up = 3 kill_vm_on_error = yes variants: -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] KVM test: Major control file cleanup
As pointed out before, the KVM reference control files could use a little clean up. This patch implements severe cleanup of the main control file by: * Refactoring the code present there, moving it to the kvm_utils.py library * Treat the build test exactly the same way as other tests, moving the config stuff that used to be in the control file realm out to its own configuration file, for the sake of consistency. This way the control file becomes way shorter, fairly well organized, and we have a consistent configuration schema across the board, based on configuration files. If people are OK with this change, final patch will change the control.parallel file as well. 2nd try: Implemented pretty much all Michael's suggestions for this patchset. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/control | 219 +++-- client/tests/kvm/kvm_utils.py | 51 ++ 2 files changed, 86 insertions(+), 184 deletions(-) diff --git a/client/tests/kvm/control b/client/tests/kvm/control index a526cc0..163286e 100644 --- a/client/tests/kvm/control +++ b/client/tests/kvm/control @@ -6,7 +6,7 @@ dh...@redhat.com (David Huff) aerom...@redhat.com (Alexey Eromenko) mbu...@redhat.com (Mike Burns) """ -TIME = 'SHORT' +TIME = 'MEDIUM' NAME = 'KVM test' TEST_TYPE = 'client' TEST_CLASS = 'Virtualization' @@ -20,194 +20,45 @@ KVM (both kernelspace and userspace) code. 
For online docs, please refer to http://www.linux-kvm.org/page/KVM-Autotest """ +import sys, os, logging +# Add the KVM tests dir to the python path +kvm_test_dir = os.path.join(os.environ['AUTODIR'],'tests/kvm') +sys.path.append(kvm_test_dir) +# Now we can import modules inside the KVM tests dir +import kvm_utils, kvm_config -import sys, os - -#- # set English environment (command output might be localized, need to be safe) -#- os.environ['LANG'] = 'en_US.UTF-8' -#- -# Enable modules import from current directory (tests/kvm) -#- -pwd = os.path.join(os.environ['AUTODIR'],'tests/kvm') -sys.path.append(pwd) - -# -# create required symlinks -# -# When dispatching tests from autotest-server the links we need do not exist on -# the host (the client). The following lines create those symlinks. Change -# 'rootdir' here and/or mount appropriate directories in it. -# -# When dispatching tests on local host (client mode) one can either setup kvm -# links, or same as server mode use rootdir and set all appropriate links and -# mount-points there. For example, guest installation tests need to know where -# to find the iso-files. -# -# We create the links only if not already exist, so if one already set up the -# links for client/local run we do not touch the links. 
-rootdir='/tmp/kvm_autotest_root' -iso=os.path.join(rootdir, 'iso') -images=os.path.join(rootdir, 'images') -qemu=os.path.join(rootdir, 'qemu') -qemu_img=os.path.join(rootdir, 'qemu-img') - - -def link_if_not_exist(ldir, target, link_name): -t = target -l = os.path.join(ldir, link_name) -if not os.path.exists(l): -os.system('ln -s %s %s' % (t, l)) - -# Create links only if not already exist -link_if_not_exist(pwd, '../../', 'autotest') -link_if_not_exist(pwd, iso, 'isos') -link_if_not_exist(pwd, images, 'images') -link_if_not_exist(pwd, qemu, 'qemu') -link_if_not_exist(pwd, qemu_img, 'qemu-img') - -# -# Params that will be passed to the KVM install/build test -# -params = { -"name": "build", -"shortname": "build", -"type": "build", -#"mode": "release", -#"mode": "snapshot", -#"mode": "localtar", -#"mode": "localsrc", -#"mode": "git", -"mode": "noinstall", -#"mode": "koji", - -## Are we going to load modules built by this test? -## Defaults to 'yes', so if you are going to provide only userspace code to -## be built by this test, please set load_modules to 'no', and make sure -## the kvm and kvm-[vendor] module is already loaded by the time you start -## it. -#"load_modules": "no", - -## Install from a kvm release ("mode": "release"). You can optionally -## specify a release tag. If you omit it, the test will get the latest -## release tag available. -#"release_tag": '84', -#"release_dir": 'http://downloads.sourceforge.net/project/kvm/', -# This is the place that contains the sourceforge project list of files -#"release_listing": 'http://sourceforge.net/projects/kvm/files/', - -## Install from a kvm snapshot location ("mode": "snapshot"). You can -## optionally specify a snapshot date. If you omit it, the
[PATCH 3/5] KVM test: Verify paths to cdrom and qemu on kvm_preprocessing
If paths to CD images and qemu binaries are not correctly configured, the tests will fail, sometimes giving to unexperienced users little clue about what is actually going on. So make sure we verify: * ISO paths * qemu binary paths Inside kvm_preprocessing code, and give clear indications if something goes wrong, asking the user to fix the configuration problem. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/kvm_preprocessing.py | 48 + 1 files changed, 48 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/kvm_preprocessing.py b/client/tests/kvm/kvm_preprocessing.py index 5bae2bd..85a2d9c 100644 --- a/client/tests/kvm/kvm_preprocessing.py +++ b/client/tests/kvm/kvm_preprocessing.py @@ -187,6 +187,29 @@ def preprocess(test, params, env): @param params: A dict containing all VM and image parameters. @param env: The environment (a dict-like object). """ +# Verify if the: +# * CD locations +# * qemu and qemu-img binaries +# are valid paths +needed_paths = [[params.get("cdrom", ""), + os.path.join(test.bindir, 'isos')], +[params.get("qemu_binary", ""), test.bindir], +[params.get("qemu_img_binary", ""), test.bindir]] + +missing_paths = [] +for needed_path, root_dir in needed_paths: +# If the test doesn't set one of the parameters, +# just don't check for it. 
+if needed_path: +needed_path = kvm_utils.get_path(root_dir, needed_path) +if not _is_path_present(needed_path): +missing_paths.append(needed_path) + +if missing_paths: +raise error.TestError("The following needed paths are missing " + "or are broken symbolic links: %s" % + missing_paths) + # Start tcpdump if it isn't already running if not env.has_key("address_cache"): env["address_cache"] = {} @@ -343,3 +366,28 @@ def _update_address_cache(address_cache, line): mac_address, address_cache.get("last_seen")) address_cache[mac_address] = address_cache.get("last_seen") del address_cache["last_seen"] + + +def _is_path_present(path): +""" +Verify whether a given path to a file is present (follows symlinks). + +@param path: Path to the file. +@return: True when the file is present, False when it's not. +""" +exists = True + +if os.path.islink(path): +source = os.path.abspath(os.readlink(path)) +if not os.path.isfile(source): +logging.warning("File %s, needed for this test, " +"is a broken symbolic link. Please fix your " +"test configuration." % path) +exists = False +elif not os.path.isfile(path): +logging.warning("File %s, needed for this test, does not exist. " +"Please fix your test configuration." % path) +exists = False + +return exists + -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] KVM test: Unattended script: Make qemu_img_bin come from test params
Instead of hard coding the path to qemu-img on the unattended_install script, let's pick it up from the test parameters. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/scripts/unattended.py |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/scripts/unattended.py b/client/tests/kvm/scripts/unattended.py index 2667649..562d317 100755 --- a/client/tests/kvm/scripts/unattended.py +++ b/client/tests/kvm/scripts/unattended.py @@ -53,7 +53,7 @@ class UnattendedInstall(object): cdrom_iso = os.environ['KVM_TEST_cdrom'] self.unattended_file = os.environ['KVM_TEST_unattended_file'] -self.qemu_img_bin = os.path.join(kvm_test_dir, 'qemu-img') +self.qemu_img_bin = os.environ['KVM_TEST_qemu_img_binary'] self.cdrom_iso = os.path.join(kvm_test_dir, cdrom_iso) self.floppy_mount = tempfile.mkdtemp(prefix='floppy_', dir='/tmp') self.cdrom_mount = tempfile.mkdtemp(prefix='cdrom_', dir='/tmp') -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] qemu-kvm: fix ia64 build breakage
Hi Xiantao, On 12.08.2009, at 06:03, Zhang, Xiantao wrote: > From 2d3d6cf55f7fecd9a9fd7c764e43b1ee56c7eebb Mon Sep 17 00:00:00 2001 > From: Xiantao Zhang > Date: Wed, 12 Aug 2009 11:39:33 +0800 > Subject: [PATCH] qemu-kvm: fix ia64 build breakage > > fix some configure issues. Do you have any plans to get the IA64 target building for 0.12 again? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: MMU: remove prefault from invlpg handler
On Sat, Dec 05, 2009 at 10:15:44PM +0200, Avi Kivity wrote: > On 12/05/2009 09:42 PM, Marcelo Tosatti wrote: >> >>> I don't think the OS has "other mechanisms", though - the processor can >>> speculate the tlb so that would be an OS bug. >> >> Can it? I figured it relied on the fact that no access (therefore no TLB >> entry instantiation) meant there is no need to invlpg (since there is >> nothing in the TLB to invalidate), before updating a particular pte. >> >> The documentation states that invlpg invalidates any entries for the >> linear address. >> > > 4.10.1.3 says, "The processor may cache translations required for > prefetches and for accesses that are a result of speculative execution > that would never actually occur in the executed code path.", so there is > no way for the OS to ensure no access has occurred. If you change a > present pte, you must execute invlpg afterwards to ensure speculation > hasn't instantiated the old pte. > > >>> It looks like a race: >>> Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a601713..58a0f1e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -455,8 +455,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -470,10 +468,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL&& is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL&& is_large_pte(*sptep { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn<< PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -492,18 +486,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); 
spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa,&gpte, -sizeof(pt_element_t))) - return; >>> >>> >>> Here, another vcpu updates the gpte and issues a new invlpg. >>> >>> - if (is_present_gpte(gpte)&& (gpte& PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, -sizeof(pt_element_t), 0); - } >>> >>> >>> And here we undo the correct invlpg with the outdated gpte. >>> >>> Looks like we considered this, since kvm_read_guest_atomic() is only >>> needed if inside the spinlock, but some other change moved the >>> spin_unlock() upwards. Will investigate history. >> >> Isnt it the OS responsability to serialize pte updates + invlpg between >> CPUs? > > It is. Do you still have a trace of the error? Maybe we can understand > what the guest thought it was doing. BAD_POOL_HEADER (19) The pool is already corrupt at the time of the current request. This may or may not be due to the caller. The internal pool links must be walked to figure out a possible cause of the problem, and then special pool applied to the suspect tags or the driver verifier to a suspect driver. Arguments: Arg1: 0021, the data following the pool block being freed is corrupt. Typically this means the consumer (call stack ) has overrun the block. Arg2: 95424000, The pool pointer being freed. Arg3: 1010, The number of bytes allocated for the pool block. Arg4: , The corrupted value found following the pool block. The BAD_POOL_HEADER BSOD happens at address 0xF8A000DDD000 (complaining it contains "00", Arg4). 
Walking the pagetables takes to 0x18996 as the pte page: (qemu) xp 0x18996ee8 (vaddr 0xF8A000DDD000) 18996ee8: 0x153c9963 (qemu) xp 0x18996ef0 (vaddr 0xF8A000DDE000) 18996ef0: 0x1528a963 qemu-system-x86-13667 [007] 425860.260987: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15f11 invlpg=1 qemu-system-x86-13670 [004] 425860.264977: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15253 invlpg=1 qemu-system-x86-13670 [004] 425860.265039: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15f15 invlpg=1 qemu-system-x86-13670 [004] 425860.266591: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 146f3 invlpg=1 qemu-system-x86-13670 [004] 425860.268128: kvm_mm
Re: A few KVM security questions
Muli Ben-Yehuda wrote: On Mon, Dec 07, 2009 at 11:38:52AM -0600, Anthony Liguori wrote: I'm skeptical that VT-d in its current form provides protection against a malicious guest. The first problem is interrupt delivery. I don't think any hypervisor has really put much thought into mitigating interrupt storms as a DoS. I think there are a number of nasty things that can be done here. Seems to me that detecting an interrupt storm and shutting the offending domain and device off is fairly easy for MSI and MSI-X interrupts, and not-interesting for legacy INTx interrupts. I don't know that any hypervisor actually implements it, though. Even if you assume that there aren't flaws in VT-d wrt malicious guests, we have generations of hardware that have not been designed to be robust against malicious operating systems. There are almost certainly untold numbers of exploitable hardware bugs that can be used to do all sorts of terrible things to the physical system. To the device? Undoubtedly. To the host? I'm not so sure. But in the context of SR-IOV, impacting the device may result in disrupting (and potentially exploiting) other domains. And I'm waiting for the "malicious guest sets server on fire" CVE :-) I'm convinced there will be at least one. VT-d protects against DMA access, but there's still plenty of things a malicious PCI device can do to harm the physical system. I'm sure you could easily program a PCI device to flood the bus which effectively mounts a DoS against other domains. But is there any way the device could do this and also evade detection of evade being taken off-line by the host, after first killing its controlling VM? Thing is, the bus is shared by the host too. So if the guest is able to bring all IO devices on the system to a halt, an administrator certainly couldn't connect remotely to take corrective action. I think all of this could potentially be detected and handled but I assume there's years of research here before that's a reality. 
There is no mechanism to arbitrate this today. It's really a dramatically different model from a security perspective. I think we need to differentiate between assigning full (legacy) devices, and assigning an SRIOV VF. In the latter---more interesting---case, the host remains in control of the overall device, so shutting off a mis-behaving VF should be simple. SR-IOV is worse IMHO because now there are multiple guests that can be impacted by a hardware exploit. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
virtio-net offloads not enabled with latest qemu-kvm
With the latest upstream qemu-kvm git tree, all the offloads are disabled on virtio-net. peer_has_vnet_hdr(n) in virtio_net_get_features() is failing because n->vc->peer is NULL. Could not figure out yet why peer field is not initialized. Do i need any new options to be specified with qemu command? qemu-system-x86_64 -m 1024 -kernel /boot/vmlinuz-2.6.32-guest -append 'root=/dev/vda1 console=tty0 console=ttyS0,115200' -initrd /boot/initrd-2.6.32-guest.img -drive file=/kvm_images/fedora10-2-vm,if=virtio,index=0 -net nic,macaddr=54:52:00:35:e3:74,model=virtio -net tap,ifname=vnet1,script=no,downscript=no Works fine with qemu-kvm-0.11.0 and i see checksum/tso/gso are enabled on guest virtio-net device. Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 07:33 PM, Joanna Rutkowska wrote: AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe it changed while I wasn't watching, though. Negative. VT-d can be used to contain PV DomUs as well. We actually verified it. Ah, good for them. It can use read() and write() (and shared memory) to communicate, just like Xen stub domains. Well, but the read() and write() syscalls, on a system like Linux, it's a gate to *lots* of code. These are very powerful system calls. But you control all the file descriptors. A minimal system would just consist of a pair of eventfd fds for signalling and shared memory (the Xen equivalent is event channels and grant tables). It's a lot of surgery, but it can be done. And then you have the code with whom this qemu communicates (e.g. the network stack). You said we could somehow use IPC to delegate it to some VM (that would have VT-d assigned NIC). But then this VM would need to use qemu again (of course this time not for net emulation). Looks non-trivial. It doesn't really need to be a VM. Once the seccomp constrained qemu processes the guest actions, the result is a fairly simple event stream. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM test: Make sure resources_check use nic_mode=tap
nic_mode=tap is required for making physical_resources to work Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/kvm_tests.cfg.sample |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index 20ae332..e08bca4 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -266,6 +266,7 @@ variants: - physical_resources_check: install setup unattended_install type = physical_resources_check +nic_mode = tap catch_uuid_cmd = dmidecode | awk -F: '/UUID/ {print $2}' # NICs -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Joanna Rutkowska wrote: >> Anthony Liguori wrote: >> >>> Avi Kivity wrote: >>> No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. >>> NB, unlike Xen, we can (and do) run qemu as non-root. Things like >>> RHEV-H and oVirt constrain the qemu process with SELinux. >>> >>> >> >> On Xen you can get rid of the qemu entirely, if you run only PV domains. >> >> >>> Also, you can use qemu to provide the backends to a Xen PV guest (see -M >>> xenpv). The effect is that you are moving that privileged code from the >>> kernel (netback/blkback) to userspace (qemu -M xenpv). >>> >>> In general, KVM tends to keep code in userspace unless absolutely >>> necessary. That's a fundamental difference from Xen which tends to do >>> the opposite. >>> >>> >> >> But the difference is that in case of Xen one can *easily* move the >> backends to small unprivileged VMs. In that case it doesn't matter the >> code is in kernel mode, it's still only in an unprivileged domain. >> > > Right, in KVM, Linux == hypervisor. A process is our "unprivileged > domain". Putting an unprivileged domain within an unprivileged domain > is probably not helpful from a security perspective since the exposure > surface is identical. > >> Sandboxing a process in a monolithic OS, like Linux, is generally >> considered unfeasible, for anything more complex than a hello world >> program. The process <-> kernel interface seem to be just too fat. See >> e.g. the recent Linux kernel overflows by Spender. >> > > That's the point of mandatory access control. Of course, you need the > right policy and Spender highlighted an issue with the standard RHEL > SELinux policy, but that should be addressed now upstream. > >> Also, SELinux seems to me like a step into the wrong direction. It not >> only adds complexity to the already-too-complex kernel, but requires >> complex configuration. See e.g. 
this paper[1] for a nice example of how >> to escape SE-sandboxed qemu on FC8 due to SELinux policy >> misconfiguration. >> >> When some people tried to add SELinux-like-thing to Xen hypervisor, it >> only resulted in an exploitable heap overflow in Xen [2]. >> > > It's certainly fair to argue the merits of SELinux as a mandatory access > control mechanism. > > Again though, that's the point of MLS. Our first line of defense is > qemu. Our second line of defense is traditional Posix direct access > control. Our third line of defense is namespace isolation (ala lxc). > Our fourth line of defense is mandatory access control (ala SELinux and > AppArmor). > > If you take a somewhat standard deployment like RHEV-H, an awful lot of > things have to go wrong before you can successfully exploit the system. > And 5.4 doesn't even implement all of what's possible. If you're really > looking to harden, you can be much more aggressive about privileges and > namespace isolation. > I think this ultimately comes down to the question: is the built-from-scratch minimal PV interface (as in Xen) more secure than the Linux's fat-but-sandboxed interface? joanna. signature.asc Description: OpenPGP digital signature
Test failures during git daily testing
Hi Folks: Today pretty much all install tests for kvm and qemu upstream git failed. the vm screen says something along the lines: Starting SeaBIOS [version-string] No bootable device. Screenshot attached. The command line the test used: /usr/local/autotest/tests/kvm/qemu -name 'vm1' -monitor unix:/tmp/monitor-20091207-120625-tyjI,server,nowait -drive file=/usr/local/autotest/tests/kvm/images/fc11-32.qcow2,if=ide -net nic,vlan=0 -net user,vlan=0 -m 512 -smp 1 -cdrom /usr/local/autotest/tests/kvm/isos/linux/Fedora-11-i386-DVD.iso -fda /usr/local/autotest/tests/kvm/images/floppy.img -tftp /usr/local/autotest/tests/kvm/images/tftpboot -boot d -bootp /pxelinux.0 -boot n -redir tcp:5000::22 -vnc :0 There's a concern that the command that was used might no longer be valid. If that's the case, please advise. Lucas <>
Re: A few KVM security questions
On Mon, Dec 07, 2009 at 06:09:55PM +0100, Joanna Rutkowska wrote: > > Also, SELinux seems to me like a step into the wrong direction. It not > only adds complexity to the already-too-complex kernel, but requires > complex configuration. See e.g. this paper[1] for a nice example of how > to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. Things have changed a lot since the time that the Xen SELinux policy was written. The Xen policy was always a tradeoff between usability & security since the XenD management tools were playing no part in the configuration, leaving it up to the administrator. With KVM & SELinux, the management tools play an active part in configuration, removing this burden from the administrator. Each VM runs under a SELinux context with a dedicated MLS category, and the resources the VM is assigned have their labelling set to match. The guest policy only allows it access to resources with a matching MLS level, so it cannot gain access to anything the administrator has not explicitly granted in the VM's configuration. This is actually simpler for administrators, since they no longer need to manage labelling themselves, while offering greater protection between VMs, which was also not possible with the old Xen policy. Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Joanna Rutkowska wrote: >> Avi Kivity wrote: >> >>> On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: >>> > Also, you can use qemu to provide the backends to a Xen PV guest > (see -M > xenpv). The effect is that you are moving that privileged code > from the > kernel (netback/blkback) to userspace (qemu -M xenpv). > > In general, KVM tends to keep code in userspace unless absolutely > necessary. That's a fundamental difference from Xen which tends to do > the opposite. > > But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. >>> They're not really unprivileged, one can easily program the dma >>> controller of their assigned pci card to read and write arbitrary host >>> memory. >>> >>> >> >> That's not true if you use VT-d. >> > > I'm skeptical that VT-d in its current form provides protection against > a malicious guest. The first problem is interrupt delivery. I don't > think any hypervisor has really put much thought into mitigating > interrupt storms as a DoS. I think there are a number of nasty things > that can be done here. > Intel VT-d v1 doesn't support interrupt remapping, so I'm sure you're right here. But a DoS attack is a different thing than a system subversion (think malware) attack. Of course which one you fear more would depend on your threat model. > Even if you assume that there aren't flaws in VT-d wrt malicious guests, > we have generations of hardware that have not been designed to be robust > against malicious operating systems. There are almost certainly untold > numbers of exploitable hardware bugs that can be used to do all sorts of > terrible things to the physical system. > Perhaps, although so far nobody presented a software-only VT-d escape attack. I think it's reasonable to assume some maniacs would discover one or two in the coming years. 
Still, probably order of magnitude less likely than a Linux kernel overflow. > VT-d protects against DMA access, but there's still plenty of things a > malicious PCI device can do to harm the physical system. I'm sure you > could easily program a PCI device to flood the bus which effectively > mounts a DoS against other domains. There is no mechanism to arbitrate > this today. It's really a dramatically different model from a security > perspective. > Agree, there are lots of DoS possibilities. It's just that for me, personally, they are not in the threat model. joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Avi Kivity wrote: On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. That's not true if you use VT-d. I'm skeptical that VT-d in its current form provides protection against a malicious guest. The first problem is interrupt delivery. I don't think any hypervisor has really put much thought into mitigating interrupt storms as a DoS. I think there are a number of nasty things that can be done here. Even if you assume that there aren't flaws in VT-d wrt malicious guests, we have generations of hardware that have not been designed to be robust against malicious operating systems. There are almost certainly untold numbers of exploitable hardware bugs that can be used to do all sorts of terrible things to the physical system. VT-d protects against DMA access, but there's still plenty of things a malicious PCI device can do to harm the physical system. I'm sure you could easily program a PCI device to flood the bus which effectively mounts a DoS against other domains. There is no mechanism to arbitrate this today. It's really a dramatically different model from a security perspective. 
Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 07:15 PM, Joanna Rutkowska wrote: But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. >>> They're not really unprivileged, one can easily program the dma >>> controller of their assigned pci card to read and write arbitrary host >>> memory. >>> >>> >> That's not true if you use VT-d. >> > > AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe > it changed while I wasn't watching, though. > Negative. VT-d can be used to contain PV DomUs as well. We actually verified it. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. >>> What about seccomp? You can easily simplify qemu to just a bunch of >>> calculations served over a pipe. >>> >>> >> But the qemu must somehow communicate with the external world too, no? >> You said you provide e.g. net backend via the qemu process... >> > > It can use read() and write() (and shared memory) to communicate, just > like Xen stub domains. > Well, but the read() and write() syscalls, on a system like Linux, it's a gate to *lots* of code. These are very powerful system calls. > It's a lot of surgery, but it can be done. > And then you have the code with whom this qemu communicates (e.g. the network stack). You said we could somehow use IPC to delegate it to some VM (that would have VT-d assigned NIC). But then this VM would need to use qemu again (of course this time not for net emulation). Looks non-trivial. joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Anthony Liguori wrote: Avi Kivity wrote: No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. NB, unlike Xen, we can (and do) run qemu as non-root. Things like RHEV-H and oVirt constrain the qemu process with SELinux. On Xen you can get rid of the qemu entirely, if you run only PV domains. Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. Right, in KVM, Linux == hypervisor. A process is our "unprivileged domain". Putting an unprivileged domain within an unprivileged domain is probably not helpful from a security perspective since the exposure surface is identical. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process <-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. That's the point of mandatory access control. Of course, you need the right policy and Spender highlighted an issue with the standard RHEL SELinux policy, but that should be addressed now upstream. Also, SELinux seems to me like a step into the wrong direction. It not only adds complexity to the already-too-complex kernel, but requires complex configuration. See e.g. this paper[1] for a nice example of how to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. 
When some people tried to add SELinux-like-thing to Xen hypervisor, it only resulted in an exploitable heap overflow in Xen [2]. It's certainly fair to argue the merits of SELinux as a mandatory access control mechanism. Again though, that's the point of MLS. Our first line of defense is qemu. Our second line of defense is traditional Posix direct access control. Our third line of defense is namespace isolation (ala lxc). Our fourth line of defense is mandatory access control (ala SELinux and AppArmor). If you take a somewhat standard deployment like RHEV-H, an awful lot of things have to go wrong before you can successfully exploit the system. And 5.4 doesn't even implement all of what's possible. If you're really looking to harden, you can be much more aggressive about privileges and namespace isolation. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 07:15 PM, Joanna Rutkowska wrote: But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. That's not true if you use VT-d. AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe it changed while I wasn't watching, though. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. What about seccomp? You can easily simplify qemu to just a bunch of calculations served over a pipe. But the qemu must somehow communicate with the external world too, no? You said you provide e.g. net backend via the qemu process... It can use read() and write() (and shared memory) to communicate, just like Xen stub domains. It's a lot of surgery, but it can be done. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: >> >>> Also, you can use qemu to provide the backends to a Xen PV guest (see -M >>> xenpv). The effect is that you are moving that privileged code from the >>> kernel (netback/blkback) to userspace (qemu -M xenpv). >>> >>> In general, KVM tends to keep code in userspace unless absolutely >>> necessary. That's a fundamental difference from Xen which tends to do >>> the opposite. >>> >>> >> But the difference is that in case of Xen one can *easily* move the >> backends to small unprivileged VMs. In that case it doesn't matter the >> code is in kernel mode, it's still only in an unprivileged domain. >> >> > > They're not really unprivileged, one can easily program the dma > controller of their assigned pci card to read and write arbitrary host > memory. > That's not true if you use VT-d. >> Sandboxing a process in a monolithic OS, like Linux, is generally >> considered unfeasible, for anything more complex than a hello world >> program. The process<-> kernel interface seem to be just too fat. See >> e.g. the recent Linux kernel overflows by Spender. >> > > What about seccomp? You can easily simplify qemu to just a bunch of > calculations served over a pipe. > But the qemu must somehow communicate with the external world too, no? You said you provide e.g. net backend via the qemu process... joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. What about seccomp? You can easily simplify qemu to just a bunch of calculations served over a pipe. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Avi Kivity wrote: >> No. Paravirtualization just augments the standard hardware interface, >> it doesn't replace it as in Xen. > > NB, unlike Xen, we can (and do) run qemu as non-root. Things like > RHEV-H and oVirt constrain the qemu process with SELinux. > On Xen you can get rid of the qemu entirely, if you run only PV domains. > Also, you can use qemu to provide the backends to a Xen PV guest (see -M > xenpv). The effect is that you are moving that privileged code from the > kernel (netback/blkback) to userspace (qemu -M xenpv). > > In general, KVM tends to keep code in userspace unless absolutely > necessary. That's a fundamental difference from Xen which tends to do > the opposite. > But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process <-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. Also, SELinux seems to me like a step into the wrong direction. It not only adds complexity to the already-too-complex kernel, but requires complex configuration. See e.g. this paper[1] for a nice example of how to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. When some people tried to add SELinux-like-thing to Xen hypervisor, it only resulted in an exploitable heap overflow in Xen [2]. [1] http://invisiblethingslab.com/resources/misc08/xenfb-adventures-10.pdf [2] http://invisiblethingslab.com/resources/bh08/part2-full.pdf joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Avi Kivity wrote: On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? It really offers no advantage. The security assumption should be that a guest can break into qemu. If a guest can break out of qemu, putting it in another qemu means that we still need to assume it can break out of that qemu. The host should treat the qemu process as hostile and constrain it by using things like -runas, -chroot, SELinux, and containers. This is what most production systems do today. libvirt certainly takes this approach. That's not to say that we know for sure that a guest can break into qemu, but designing around that assumption gives us MLS. Regards, Anthony Liguori joanna. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. NB, unlike Xen, we can (and do) run qemu as non-root. Things like RHEV-H and oVirt constrain the qemu process with SELinux. Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 17:09, schrieb Jan Kiszka: > Kevin Wolf wrote: >> In qcow_aio_write_cb there isn't much happening between these calls. The >> only thing that could somehow become dangerous is the >> qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. > > If m->nb_clusters is zero, the entry won't be removed from the list. And > if something corrupted nb_clusters so that it became 0 although it's > still enqueued, we would see the deadly loop I faced, right? > Unfortunately, any arbitrary memory corruption that generates such zeros > can cause this... Right, this looks like another way to get into that endless loop. I don't think it's very likely the cause, but who knows. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Kevin Wolf wrote: > Am 07.12.2009 15:16, schrieb Jan Kiszka: >>> Likely not. What I did was nothing special, and I did not notice such a >>> crash in the last months. >> And now it happened again (qemu-kvm head, during kernel installation >> from network onto local qcow2-disk). Any clever idea how to proceed with >> this? > > I still haven't seen this and I still have no theory on what could be > happening here. I'm just trying to write down what I think must happen > to get into this situation. Maybe you can point at something I'm missing > or maybe it helps you to have a sudden inspiration. > > The crash happens because we have a loop in the s->cluster_allocs list. > A loop can only be created by inserting an object twice. The only insert > to this list happens in qcow2_alloc_cluster_offset (though an earlier > call than that of the stack trace). > > There is only one relevant caller of this function, qcow_aio_write_cb. > Part of it is a call to run_dependent_requests which removes the request > from s->cluster_allocs. So after the QLIST_REMOVE in > run_dependent_requests the request can't be contained in the list, but > at the call of qcow2_alloc_cluster_offset it must be contained again. It > must be added somewhere in between these two calls. > > In qcow_aio_write_cb there isn't much happening between these calls. The > only thing that could somehow become dangerous is the > qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. If m->nb_clusters is zero, the entry won't be removed from the list. And if something corrupted nb_clusters so that it became 0 although it's still enqueued, we would see the deadly loop I faced, right? Unfortunately, any arbitrary memory corruption that generates such zeros can cause this... 
Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
On 12/07/2009 05:32 PM, Jiaqing Du wrote: Hi Avi, I did not get your point. But if we want to multiplex some of the MSRs across the VMM and the guest(s), it would be handy if the hardware provides this feature: save host's version and load guest's version. Of course, we can do this manually. I'm just wondering why this feature is missing. Well, you'll have to ask the designers of the feature. If it can be done manually, why add a feature in hardware? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
Hi Avi, I did not get your point. But if we want to multiplex some of the MSRs across the VMM and the guest(s), it would be handy if the hardware provides this feature: save host's version and load guest's version. Of course, we can do this manually. I'm just wondering why this feature is missing. Thanks, Jiaqing 2009/12/7 Avi Kivity : > On 12/07/2009 05:07 PM, Jiaqing Du wrote: >> >> Hi List, >> >> My question is about VM-Exit& VM-Entry controls for MSRs on Intel's >> processors. >> >> For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded >> on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs >> to be loaded on VM entries. Why does not the processor have the >> feature that stores MSRs before loading new ones for VM entries? >> > > Presumably the host knows what values are in those MSRs, so it doesn't need > to store them. > > -- > error compiling committee.c: too many arguments to function > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Kevin Wolf wrote: > Am 07.12.2009 15:50, schrieb Jan Kiszka: >> Jan Kiszka wrote: >>> And now it happened again (qemu-kvm head, during kernel installation >>> from network onto local qcow2-disk). Any clever idea how to proceed with >>> this? >>> >>> I could try to run the step in a loop, hopefully retriggering it once in >>> a (likely longer) while. But then we need some good instrumentation first. >>> >> Maybe I'm seeing ghosts, and I don't even have a minimal clue about what >> goes on in the code, but this looks fishy: >> >> preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a >> stack variable. It seems that qcow2_alloc_cluster_offset() may insert >> this structure into cluster_allocs and leave it there. So we corrupt the >> queue as soon as preallocate() returns, no? > > preallocate() is about metadata preallocation during image creation. It > is only ever run by qemu-img. Apart from that it calls > run_dependent_requests() which removes the request from the list again. OK, I see - was far too easy anyway. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
On 12/07/2009 05:07 PM, Jiaqing Du wrote: Hi List, My question is about VM-Exit& VM-Entry controls for MSRs on Intel's processors. For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs to be loaded on VM entries. Why does not the processor have the feature that stores MSRs before loading new ones for VM entries? Presumably the host knows what values are in those MSRs, so it doesn't need to store them. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
MSRs load/store
Hi List, My question is about VM-Exit & VM-Entry controls for MSRs on Intel's processors. For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs to be loaded on VM entries. Why does not the processor have the feature that stores MSRs before loading new ones for VM entries? Thanks, Jiaqing -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 15:50, schrieb Jan Kiszka: > Jan Kiszka wrote: >> And now it happened again (qemu-kvm head, during kernel installation >> from network onto local qcow2-disk). Any clever idea how to proceed with >> this? >> >> I could try to run the step in a loop, hopefully retriggering it once in >> a (likely longer) while. But then we need some good instrumentation first. >> > > Maybe I'm seeing ghosts, and I don't even have a minimal clue about what > goes on in the code, but this looks fishy: > > preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a > stack variable. It seems that qcow2_alloc_cluster_offset() may insert > this structure into cluster_allocs and leave it there. So we corrupt the > queue as soon as preallocate() returns, no? preallocate() is about metadata preallocation during image creation. It is only ever run by qemu-img. Apart from that it calls run_dependent_requests() which removes the request from the list again. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
On 12/07/2009 04:50 PM, Jan Kiszka wrote: Maybe I'm seeing ghosts, and I don't even have a minimal clue about what goes on in the code, but this looks fishy: Plenty of ghosts in qcow2, of all those explorers who tried to brave the code. Only Kevin has ever come back. preallocate() invokes qcow2_alloc_cluster_offset() passing&meta, a stack variable. It seems that qcow2_alloc_cluster_offset() may insert this structure into cluster_allocs and leave it there. So we corrupt the queue as soon as preallocate() returns, no? We invoke run_dependent_requests() which should dequeue those &meta again (I think). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 15:16, schrieb Jan Kiszka: >> Likely not. What I did was nothing special, and I did not noticed such a >> crash in the last months. > > And now it happened again (qemu-kvm head, during kernel installation > from network onto local qcow2-disk). Any clever idea how to proceed with > this? I still haven't seen this and I still have no theory on what could be happening here. I'm just trying to write down what I think must happen to get into this situation. Maybe you can point at something I'm missing or maybe it helps you to have a sudden inspiration. The crash happens because we have a loop in the s->cluster_allocs list. A loop can only be created by inserting an object twice. The only insert to this list happens in qcow2_alloc_cluster_offset (though an earlier call than that of the stack trace). There is only one relevant caller of this function, qcow_aio_write_cb. Part of it is a call to run_dependent_requests which removes the request from s->cluster_allocs. So after the QLIST_REMOVE in run_dependent_requests the request can't be contained in the list, but at the call of qcow2_alloc_cluster_offset it must be contained again. It must be added somewhere in between these two calls. In qcow_aio_write_cb there isn't much happening between these calls. The only thing that could somehow become dangerous is the qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. > I could try to run the step in a loop, hopefully retriggering it once in > a (likely longer) while. But then we need some good instrumentation first. I can't explain what exactly would be going wrong there, but if my thoughts are right so far, I think that moving this into a Bottom Half would help. So if you can reproduce it in a loop this could be worth a try. I'd certainly prefer to understand the problem first, but thinking about AIO is the perfect way to make your brain hurt... 
Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Jan Kiszka wrote: > And now it happened again (qemu-kvm head, during kernel installation > from network onto local qcow2-disk). Any clever idea how to proceed with > this? > > I could try to run the step in a loop, hopefully retriggering it once in > a (likely longer) while. But then we need some good instrumentation first. > Maybe I'm seeing ghosts, and I don't even have a minimal clue about what goes on in the code, but this looks fishy: preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a stack variable. It seems that qcow2_alloc_cluster_offset() may insert this structure into cluster_allocs and leave it there. So we corrupt the queue as soon as preallocate() returns, no? Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Jan Kiszka wrote: > Kevin Wolf wrote: >> Hi Jan, >> >> Am 19.11.2009 13:19, schrieb Jan Kiszka: >>> (gdb) print ((BDRVQcowState *)bs->opaque)->cluster_allocs.lh_first >>> $5 = (struct QCowL2Meta *) 0xcb3568 >>> (gdb) print *((BDRVQcowState *)bs->opaque)->cluster_allocs.lh_first >>> $6 = {offset = 7417176064, n_start = 0, nb_available = 16, nb_clusters = 0, >>> depends_on = 0xcb3568, dependent_requests = {lh_first = 0x0}, >>> next_in_flight = {le_next = 0xcb3568, le_prev = 0xc4ebd8}} >>> >>> So next == first. >> Oops. Doesn't sound quite right... >> >>> Is something fiddling with cluster_allocs concurrently, e.g. some signal >>> handler? Or what could cause this list corruption? Would it be enough to >>> move to QLIST_FOREACH_SAFE? >> Are there any specific signals you're thinking of? Related to block code > > No, was just blind guessing. > >> I can only think of SIGUSR2 and this one shouldn't call any block driver >> functions directly. You're using aio=threads, I assume? (It's the default) > > Yes, all on defaults. > >> QLIST_FOREACH_SAFE shouldn't make a difference in this place as the loop >> doesn't insert or remove any elements. If the list is corrupted now, I >> think it would be corrupted with QLIST_FOREACH_SAFE as well - at best, >> the endless loop would occur one call later. >> >> The only way I see to get such a loop in a list is to re-insert an >> element that already is part of the list. The only insert is at >> qcow2-cluster.c:777. Remains the question how we came there twice >> without run_dependent_requests() removing the L2Meta from our list first >> - because this is definitely wrong... >> >> Presumably, it's not reproducible? > > Likely not. What I did was nothing special, and I did not noticed such a > crash in the last months. And now it happened again (qemu-kvm head, during kernel installation from network onto local qcow2-disk). Any clever idea how to proceed with this? 
I could try to run the step in a loop, hopefully retriggering it once in a (likely longer) while. But then we need some good instrumentation first. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 04:06 PM, Joanna Rutkowska wrote: Can you point to a document/source file that would list all the possible interfaces between VM and the host? I.e. all the VMX handlers, and all the hypercalls (PV interfaces). arch/x86/kvm/vmx.c is the entry point for all interaction, but it quickly diverges. arch/x86/include/asm/kvm_para.h is the pv interface. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 03:30 PM, Joanna Rutkowska wrote: >> Avi Kivity wrote: >> >> 1) Do you have any support for para-virtualized VMs? >>> Yes, for example, we support paravirtualized timers and mmu for Linux. >>> These are fairly minimal compared to Xen's pv domains. >>> >>> >> Can I run a regular Linux as PV-guest? Specifically, can I get rid of >> qemu totally, assuming I have only PV guests? >> >> > > No. Paravirtualization just augments the standard hardware interface, > it doesn't replace it as in Xen. > Can you point to a document/source file that would list all the possible interfaces between VM and the host? I.e. all the VMX handlers, and all the hypercalls (PV interfaces). joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:55 PM, Joanna Rutkowska wrote: It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? Right, but to exploit this, you'd have to exploit the internal qemu, exploit the kernel, and exploit the external qemu. Well, if the exploit was in some central thing I guess there isn't much value in the nesting. You could alternatively use Xenner to run Xen guests for your qemu. That emulates a lot less. But AFAIK xenner is moving towards qemu as well. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: >> In particular, is >> it possible to move the qemu from the host to one of the VMs? Perhaps to >> have a separate copy of qemu for each VM? (ala Xen's stub-domains) >> > > It should be fairly easy to place qemu in a guest. You would leave a > simple program on the host to communicate with kvm and pass any data > written by the guest to qemu running in another guest, and feed any > replies back to the guest. > But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:30 PM, Joanna Rutkowska wrote: Avi Kivity wrote: 1) Do you have any support for para-virtualized VMs? Yes, for example, we support paravirtualized timers and mmu for Linux. These are fairly minimal compared to Xen's pv domains. Can I run a regular Linux as PV-guest? Specifically, can I get rid of qemu totally, assuming I have only PV guests? No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. E.g. do you have PV network and disk frontends (PV drivers), that I could use on guests and that do not use qemu at all? We do have PV network and disk frontends, but the backends (devices) are still in qemu. Should be doable by assigning the NIC to a driver domain and bridging it to a virtio driver; then have the driver domain's virtio device talk to the ordinary guests. But bridging would still require to have some networking support (+net backends) on the host (sure, without any real NIC driver, but still), correct? If you were willing to hack a bit, you can use any IPC to pass the packets instead of the networking stack (for example, shared memory + eventfd for signalling). 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Yes, there is a stub driver that does this. Does this stub driver sets DMA protections, so that the device in question cannot access any host memory? That is important, because once you assigned a device to some VM, we should assume the VM might have somehow compromised the device, e.g. reflashed the firmware of the NIC, perhaps. So, it's important to be able to protect the hypervisor from such devices. kvm places assigned devices in an iommu protection domain so it cannot attack the host. Once the guest stops using the device, we reset it. 
If the guest is able to upload a malicious, persistent payload to the device, then when the device is reused whoever uses it will be vulnerable (whether a new guest or the host). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: >> 1) Do you have any support for para-virtualized VMs? > > Yes, for example, we support paravirtualized timers and mmu for Linux. > These are fairly minimal compared to Xen's pv domains. > Can I run a regular Linux as PV-guest? Specifically, can I get rid of qemu totally, assuming I have only PV guests? E.g. do you have PV network and disk frontends (PV drivers), that I could use on guests and that do not use qemu at all? >> 2) Is it possible to have driver domains in KVM? E.g. I would like to >> assign my NIC to one VM (a "network domain") and then I would like other >> domains to use this network domain for networking. In case of Xen, this >> is done by moving the network backend (which is not qemu BTW) into the >> network domain, and configuring the network frontends in other VMs to >> talk to this network domain's backend, rather then to Dom0's backend (in >> fact you can get rid of all the networking in Dom0). >> > > Should be doable by assigning the NIC to a driver domain and bridging it > to a virtio driver; then have the driver domain's virtio device talk to > the ordinary guests. But bridging would still require to have some networking support (+net backends) on the host (sure, without any real NIC driver, but still), correct? >> 4) Do you have some method of excluding particular PCI devices from >> being initialized by your host Linux? E.g. those devices that are later >> to be assigned to some VMs (via VT-d passthrough)? > > Yes, there is a stub driver that does this. > Does this stub driver sets DMA protections, so that the device in question cannot access any host memory? That is important, because once you assigned a device to some VM, we should assume the VM might have somehow compromised the device, e.g. reflashed the firmware of the NIC, perhaps. So, it's important to be able to protect the hypervisor from such devices. Thanks, joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: Hello, I have the following questions regarding the KVM architecture. I looked at the slides available at linux-kvm.org, but didn't find definitive answers. I'm also interested to learn if given feature is or is not planned for the near future. The questions follow: 1) Do you have any support for para-virtualized VMs? Yes, for example, we support paravirtualized timers and mmu for Linux. These are fairly minimal compared to Xen's pv domains. In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. It should also be possible to constrain qemu using SECCOMP. None of this has been attempted to my knowledge. 2) Is it possible to have driver domains in KVM? E.g. I would like to assign my NIC to one VM (a "network domain") and then I would like other domains to use this network domain for networking. In case of Xen, this is done by moving the network backend (which is not qemu BTW) into the network domain, and configuring the network frontends in other VMs to talk to this network domain's backend, rather then to Dom0's backend (in fact you can get rid of all the networking in Dom0). Should be doable by assigning the NIC to a driver domain and bridging it to a virtio driver; then have the driver domain's virtio device talk to the ordinary guests. 3) Do you have any support for TXT-based trusted boot? I guess you indirectly have via tboot. However, how do you deal with VT-d protections? The tboot.gz should normally DMA-protect memory before handing execution over to Linux kernel. But then you need to allow your drivers to work. 
Do you unprotect all the memory for DMA, or do you have some support for selectively unprotect only those regions of memory which are needed by (some) drivers? If the latter, how do you determine which memory should be DMA-unprotected? I know nothing about tboot. 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Yes, there is a stub driver that does this. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
A few KVM security questions
Hello, I have the following questions regarding the KVM architecture. I looked at the slides available at linux-kvm.org, but didn't find definitive answers. I'm also interested to learn if given feature is or is not planned for the near future. The questions follow: 1) Do you have any support for para-virtualized VMs? In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) 2) Is it possible to have driver domains in KVM? E.g. I would like to assign my NIC to one VM (a "network domain") and then I would like other domains to use this network domain for networking. In case of Xen, this is done by moving the network backend (which is not qemu BTW) into the network domain, and configuring the network frontends in other VMs to talk to this network domain's backend, rather then to Dom0's backend (in fact you can get rid of all the networking in Dom0). 3) Do you have any support for TXT-based trusted boot? I guess you indirectly have via tboot. However, how do you deal with VT-d protections? The tboot.gz should normally DMA-protect memory before handing execution over to Linux kernel. But then you need to allow your drivers to work. Do you unprotect all the memory for DMA, or do you have some support for selectively unprotect only those regions of memory which are needed by (some) drivers? If the latter, how do you determine which memory should be DMA-unprotected? 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Thanks, I would appreciate any answers. Please note I'm not subscribed to the list, so won't get your response if sent only to the list. Regards, joanna. -- Joanna Rutkowska Founder/CEO Invisible Things Lab http://invisiblethingslab.com/ signature.asc Description: OpenPGP digital signature
Re: [Autotest][PATCH 1/2] add hackbench test to kvm autotest
FYI, this was already incorporated to the tree, thanks Sudhir! On Fri, 2009-12-04 at 11:19 +0530, sudhir kumar wrote: > This patch adds the hackbench test for the KVM linux guests. > > Signed-off-by: Sudhir Kumar > > Index: kvm/autotest_control/hackbench.control > === > --- /dev/null > +++ kvm/autotest_control/hackbench.control > @@ -0,0 +1,13 @@ > +AUTHOR = "Sudhir Kumar " > +NAME = "Hackbench" > +TIME = "SHORT" > +TEST_CLASS = "Kernel" > +TEST_CATEGORY = "Benchmark" > +TEST_TYPE = "client" > + > +DOC = """ > +Hackbench is a benchmark which measures the performance, overhead and > +scalability of the Linux scheduler. > + > +""" > +job.run_test('hackbench') > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] KVM: Add accessor for reading cr4 (or some bits of cr4)
Some bits of cr4 can be owned by the guest on vmx, so when we read them, we copy them to the vcpu structure. In preparation for making the set of guest-owned bits dynamic, use helpers to access these bits so we don't need to know where the bit resides. No changes to svm since all bits are host-owned there. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/kvm_cache_regs.h | 12 arch/x86/kvm/mmu.h |5 +++-- arch/x86/kvm/vmx.c | 13 - arch/x86/kvm/x86.c | 16 ++-- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da6dee8..e9f4f12 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -272,6 +272,7 @@ struct kvm_vcpu_arch { unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6..35acc36 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b38..4567d80 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PAE; + return 
kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PSE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ef820e..ae95a0c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1612,8 +1612,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1658,7 +1660,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1666,7 +1668,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) @@ -2417,6 +2419,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_GUEST_CR4_MASK; tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -3047,7 +3050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 
5: - if (vcpu->arch.cr4 & X86_CR4_DE) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) kvm_queue_exception(vcpu, UD_VECTOR); break; case 6: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd15d7a..4a16337 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -481,7 +481,7 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr
[PATCH 3/4] KVM: VMX: Make guest cr4 mask more conservative
Instead of specifying the bits which we want to trap on, specify the bits which we allow the guest to change transparently. This is safer wrt future changes to cr4. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++ 1 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae95a0c..d34fdd3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -69,8 +69,10 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ +| X86_CR4_OSXMMEXCPT) + #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -2418,8 +2420,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_GUEST_CR4_MASK; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] cr4 optimizations for vmx/ept
When ept is enabled, we aren't particularly interested in cr4.pge, so allow the guest to own it. This improves performance in vmap() intensive loads. Avi Kivity (4): KVM: VMX: Move some cr[04] related constants to vmx.c KVM: Add accessor for reading cr4 (or some bits of cr4) KVM: VMX: Make guest cr4 mask more conservative KVM: VMX: When using ept, allow the guest to own cr4.pge arch/x86/include/asm/kvm_host.h | 14 +- arch/x86/kvm/kvm_cache_regs.h | 12 arch/x86/kvm/mmu.h |5 +++-- arch/x86/kvm/vmx.c | 32 ++-- arch/x86/kvm/x86.c | 16 ++-- 5 files changed, 48 insertions(+), 31 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] KVM: VMX: When using ept, allow the guest to own cr4.pge
We make no use of cr4.pge if ept is enabled, but the guest does (to flush global mappings, as with vmap()), so give the guest ownership of this bit. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d34fdd3..2e47e65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2421,6 +2421,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] KVM: VMX: Move some cr[04] related constants to vmx.c
They have no place in common code. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 - arch/x86/kvm/vmx.c | 13 + 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8..da6dee8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFF00ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9a0a2cf..5ef820e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,19 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_GUEST_CR4_MASK \ + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) 
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap:upper bound on the amount of time between two successive -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Autotest] [PATCH] Add a server-side test - kvm_migration
Resending with proper cc list :( On Mon, Dec 7, 2009 at 2:43 PM, sudhir kumar wrote: > Thanks for initiating the server side implementation of migration. Few > comments below > > On Fri, Dec 4, 2009 at 1:48 PM, Yolkfull Chow wrote: >> This patch will add a server-side test namely kvm_migration. Currently, >> it will use existing KVM client test framework and add a new file >> kvm_migration.py to help judge executing routine: source machine or dest >> machine. >> >> * One thing need to be considered/improved: >> Whether we parse the kvm_tests.cfg on server machine or on client machines? >> If parse it on client machines, we need to fix one problem that adding >> 'start_vm_for_migration' parameter into dict which generated on dest machine. > I think we can not manage with client side parsing without adding too > much complexity. So let us continue parsing on the server side only > for remote migration. Also as the patch does, keep the local migration > under the client also. I do not like adding test variants in > migration_control.srv. Comments below... >> >> So far I choose parsing kvm_tests.cfg on server machine, and then add >> 'start_vm_for_migration' into dict cloned from original test dict for dest >> machine. >> >> * In order to run this test so far, we need to setup NFS for both >> source and dest machines. 
>> >> Signed-off-by: Yolkfull Chow >> --- >> client/tests/kvm/kvm_migration.py | 165 >> >> client/tests/kvm/kvm_test_utils.py | 27 +++--- >> client/tests/kvm/kvm_tests.cfg.sample | 2 + >> client/tests/kvm_migration | 1 + >> server/tests/kvm/migration_control.srv | 137 ++ >> 5 files changed, 320 insertions(+), 12 deletions(-) >> create mode 100644 client/tests/kvm/kvm_migration.py >> create mode 120000 client/tests/kvm_migration >> create mode 100644 server/tests/kvm/migration_control.srv >> >> diff --git a/client/tests/kvm/kvm_migration.py >> b/client/tests/kvm/kvm_migration.py >> new file mode 100644 >> index 000..52cd3cd >> --- /dev/null >> +++ b/client/tests/kvm/kvm_migration.py >> @@ -0,0 +1,165 @@ >> +import sys, os, time, logging, commands, socket >> +from autotest_lib.client.bin import test >> +from autotest_lib.client.common_lib import error >> +import kvm_utils, kvm_preprocessing, common, kvm_vm, kvm_test_utils >> + >> + >> +class kvm_migration(test.test): >> + """ >> + KVM migration test. >> + >> + @copyright: Red Hat 2008-2009 >> + @see: http://www.linux-kvm.org/page/KVM-Autotest/Client_Install >> + (Online doc - Getting started with KVM testing) >> + >> + Migration execution progress: >> + >> + source host dest host >> + -- >> + log into guest >> + -- >> + start socket server >> + >> + wait 30 secs -- wait login_timeout+30 secs--- >> + >> + accept connection connect to socket server,send mig_port >> + -- >> + start migration >> + >> + wait 30 secs -- wait mig_timeout+30 secs- >> + >> + try to log into migrated guest >> + -- >> + >> + """ >> + version = 1 >> + def initialize(self): >> + pass >> + >> + >> + def run_once(self, params): >> + """ >> + Setup remote machine and then execute migration. 
>> + """ >> + # Check whether remote machine is ready >> + dsthost = params.get("dsthost") >> + srchost = params.get("srchost") >> + image_path = os.path.join(self.bindir, "images") >> + >> + rootdir = params.get("rootdir") >> + iso = os.path.join(rootdir, 'iso') >> + images = os.path.join(rootdir, 'images') >> + qemu = os.path.join(rootdir, 'qemu') >> + qemu_img = os.path.join(rootdir, 'qemu-img') >> + >> + def link_if_not_exist(ldir, target, link_name): >> + t = target >> + l = os.path.join(ldir, link_name) >> + if not os.path.exists(l): >> + os.symlink(t,l) >> + link_if_not_exist(self.bindir, '../../', 'autotest') >> + link_if_not_exist(self.bindir, iso, 'isos') >> + link_if_not_exist(self.bindir, images, 'images') >> + link_if_not_exist(self.bindir, qemu, 'qemu') >> + link_if_not_exist(self.bindir, qemu_img, 'qemu-img') >> + >> + # Report the parameters we've received and write them as keyvals >> + logging.debug("Test parameters:") >> + keys = params.keys() >> + keys.sort() >> + for key in keys: >> + logging.debug(" %s = %s", key, params[key]) >> + self.write_test_keyval({key: params[key]}) >> + >> + # Ope
[PATCH] KVM: Fix possible circular locking in kvm_vm_ioctl_assign_device()
One possible order is: KVM_CREATE_IRQCHIP ioctl(took kvm->lock) -> kvm_iobus_register_dev() -> down_write(kvm->slots_lock). The other one is in kvm_vm_ioctl_assign_device(), which take kvm->slots_lock first, then kvm->lock. Observe it due to kernel locking debug warnings. Signed-off-by: Sheng Yang --- virt/kvm/assigned-dev.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index fd9c097..f73de63 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -508,8 +508,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; - down_read(&kvm->slots_lock); mutex_lock(&kvm->lock); + down_read(&kvm->slots_lock); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_dev->assigned_dev_id); @@ -573,8 +573,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } out: - mutex_unlock(&kvm->lock); up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); return r; out_list_del: list_del(&match->list); @@ -585,8 +585,8 @@ out_put: pci_dev_put(dev); out_free: kfree(match); - mutex_unlock(&kvm->lock); up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); return r; } -- 1.5.4.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html