Re: [PATCH 2/4] KVM: Add accessor for reading cr4 (or some bits of cr4)
On Monday 07 December 2009 18:47:10 Avi Kivity wrote: > Some bits of cr4 can be owned by the guest on vmx, so when we read them, > we copy them to the vcpu structure. In preparation for making the set of > guest-owned bits dynamic, use helpers to access these bits so we don't need > to know where the bit resides. > > No changes to svm since all bits are host-owned there. > > Signed-off-by: Avi Kivity > --- > arch/x86/include/asm/kvm_host.h |1 + > arch/x86/kvm/kvm_cache_regs.h | 12 > arch/x86/kvm/mmu.h |5 +++-- > arch/x86/kvm/vmx.c | 13 - > arch/x86/kvm/x86.c | 16 ++-- > 5 files changed, 30 insertions(+), 17 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h > b/arch/x86/include/asm/kvm_host.h index da6dee8..e9f4f12 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -272,6 +272,7 @@ struct kvm_vcpu_arch { > unsigned long cr2; > unsigned long cr3; > unsigned long cr4; > + unsigned long cr4_guest_owned_bits; > unsigned long cr8; > u32 hflags; > u64 pdptrs[4]; /* pae */ > diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h > index 7bcc5b6..35acc36 100644 > --- a/arch/x86/kvm/kvm_cache_regs.h > +++ b/arch/x86/kvm/kvm_cache_regs.h > @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, > int index) return vcpu->arch.pdptrs[index]; > } > > +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) > +{ > + if (mask & vcpu->arch.cr4_guest_owned_bits) > + kvm_x86_ops->decache_cr4_guest_bits(vcpu); > + return vcpu->arch.cr4 & mask; > +} > + > +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) > +{ > + return kvm_read_cr4_bits(vcpu, ~0UL); > +} > + > #endif > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 61a1b38..4567d80 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -2,6 +2,7 @@ > #define __KVM_X86_MMU_H > > #include > +#include "kvm_cache_regs.h" > > #define PT64_PT_BITS 9 > #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) > @@ 
-64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) > > static inline int is_pae(struct kvm_vcpu *vcpu) > { > - return vcpu->arch.cr4 & X86_CR4_PAE; > + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); > } > > static inline int is_pse(struct kvm_vcpu *vcpu) > { > - return vcpu->arch.cr4 & X86_CR4_PSE; > + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); > } > > static inline int is_paging(struct kvm_vcpu *vcpu) > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 5ef820e..ae95a0c 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -1612,8 +1612,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) > > static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) > { > - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; > - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; > + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; > + > + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; > + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; > } > > static void ept_load_pdptrs(struct kvm_vcpu *vcpu) > @@ -1658,7 +1660,7 @@ static void ept_update_paging_mode_cr0(unsigned long > *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | > CPU_BASED_CR3_STORE_EXITING)); > vcpu->arch.cr0 = cr0; > - vmx_set_cr4(vcpu, vcpu->arch.cr4); > + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); > } else if (!is_paging(vcpu)) { > /* From nonpaging to paging */ > vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, > @@ -1666,7 +1668,7 @@ static void ept_update_paging_mode_cr0(unsigned long > *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | > CPU_BASED_CR3_STORE_EXITING)); > vcpu->arch.cr0 = cr0; > - vmx_set_cr4(vcpu, vcpu->arch.cr4); > + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); > } Another place accessed cr4 directly, in ept_update_paging_mode_cr4() >} else if (!(vcpu->arch.cr4 & X86_CR4_PAE)) >*hw_cr4 &= ~X86_CR4_PAE; Others looks fine to me. 
-- regards Yang, Sheng -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Autotest] [PATCH] Add a server-side test - kvm_migration
On Mon, Dec 07, 2009 at 03:35:54PM +0530, sudhir kumar wrote: > Resending with proper cc list :( > > On Mon, Dec 7, 2009 at 2:43 PM, sudhir kumar wrote: > > Thanks for initiating the server side implementation of migration. Few > > comments below > > > > On Fri, Dec 4, 2009 at 1:48 PM, Yolkfull Chow wrote: > >> This patch will add a server-side test namely kvm_migration. Currently, > >> it will use existing KVM client test framework and add a new file > >> kvm_migration.py to help judge executing routine: source machine or dest > >> machine. > >> > >> * One thing need to be considered/improved: > >> Whether we parse the kvm_tests.cfg on server machine or on client machines? > >> If parse it on client machines, we need to fix one problem that adding > >> 'start_vm_for_migration' parameter into dict which generated on dest > >> machine. > > I think we can not manage with client side parsing without adding too > > much complexity. So let us continue parsing on the server side only > > for remote migration. Also as the patch does, keep the local migration > > under the client also. I do not like adding test variants in > > migration_control.srv. Comments below... > >> > >> So far I choose parsing kvm_tests.cfg on server machine, and then add > >> 'start_vm_for_migration' into dict cloned from original test dict for dest > >> machine. > >> > >> * In order to run this test so far, we need to setup NFS for both > >> source and dest machines. 
> >> > >> Signed-off-by: Yolkfull Chow > >> --- > >> client/tests/kvm/kvm_migration.py | 165 > >> > >> client/tests/kvm/kvm_test_utils.py | 27 +++--- > >> client/tests/kvm/kvm_tests.cfg.sample | 2 + > >> client/tests/kvm_migration | 1 + > >> server/tests/kvm/migration_control.srv | 137 ++ > >> 5 files changed, 320 insertions(+), 12 deletions(-) > >> create mode 100644 client/tests/kvm/kvm_migration.py > >> create mode 12 client/tests/kvm_migration > >> create mode 100644 server/tests/kvm/migration_control.srv > >> > >> diff --git a/client/tests/kvm/kvm_migration.py > >> b/client/tests/kvm/kvm_migration.py > >> new file mode 100644 > >> index 000..52cd3cd > >> --- /dev/null > >> +++ b/client/tests/kvm/kvm_migration.py > >> @@ -0,0 +1,165 @@ > >> +import sys, os, time, logging, commands, socket > >> +from autotest_lib.client.bin import test > >> +from autotest_lib.client.common_lib import error > >> +import kvm_utils, kvm_preprocessing, common, kvm_vm, kvm_test_utils > >> + > >> + > >> +class kvm_migration(test.test): > >> + """ > >> + KVM migration test. > >> + > >> + �...@copyright: Red Hat 2008-2009 > >> + �...@see: http://www.linux-kvm.org/page/KVM-Autotest/Client_Install > >> + (Online doc - Getting started with KVM testing) > >> + > >> + Migration execution progress: > >> + > >> + source host dest host > >> + -- > >> + log into guest > >> + -- > >> + start socket server > >> + > >> + wait 30 secs -- wait login_timeout+30 secs--- > >> + > >> + accept connection connect to socket server,send mig_port > >> + -- > >> + start migration > >> + > >> + wait 30 secs -- wait mig_timeout+30 secs- > >> + > >> + try to log into migrated guest > >> + -- > >> + > >> + """ > >> + version = 1 > >> + def initialize(self): > >> + pass > >> + > >> + > >> + def run_once(self, params): > >> + """ > >> + Setup remote machine and then execute migration. 
> >> + """ > >> + # Check whether remote machine is ready > >> + dsthost = params.get("dsthost") > >> + srchost = params.get("srchost") > >> + image_path = os.path.join(self.bindir, "images") > >> + > >> + rootdir = params.get("rootdir") > >> + iso = os.path.join(rootdir, 'iso') > >> + images = os.path.join(rootdir, 'images') > >> + qemu = os.path.join(rootdir, 'qemu') > >> + qemu_img = os.path.join(rootdir, 'qemu-img') > >> + > >> + def link_if_not_exist(ldir, target, link_name): > >> + t = target > >> + l = os.path.join(ldir, link_name) > >> + if not os.path.exists(l): > >> + os.symlink(t,l) > >> + link_if_not_exist(self.bindir, '../../', 'autotest') > >> + link_if_not_exist(self.bindir, iso, 'isos') > >> + link_if_not_exist(self.bindir, images, 'images') > >> + link_if_not_exist(self.bindir, qemu, 'qemu') > >> + link_if_not_exist(self.bindir, qemu_img, 'qemu-img') > >> + > >> + # Report the parameters we've received and
Re: [PATCH] virtio spec: add virtio-blk max sectors feature
On Thu, 3 Dec 2009 08:28:38 pm Avi Kivity wrote: > On 12/03/2009 10:42 AM, Avishay Traeger1 wrote: > > I previously submitted a patch to have the guest virtio-blk driver get the > > value for the maximum I/O size from the host bdrv, rather than assume that > > there is no limit. Avi requested that I first patch the virtio spec > > (http://ozlabs.org/~rusty/virtio-spec/). Below is that patch. > > > > Please CC me on replies, as I am not subscribed. > > > > > > Copying Rusty and virtualizat...@. Thanks Avi... Avishay; this would be the total sectors in an I/O, as separate from SIZE_MAX (maximum size of any single scatterlist entry) and SEG_MAX (maximum number of scatterlist entries)? Seems like a reasonable idea; esp if you need it. Thanks! Rusty. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[AUTOTEST PATCH] KVM test: subtest block_hotplug: Fixup pci_test_cmd in config file
RHEL-4.8 is still using 'hd[a-z]' as harddisk device name. This patch adds 'h' to regular expression in command `pci_test_cmd'. Signed-off-by: Yolkfull Chow --- client/tests/kvm/kvm_tests.cfg.sample |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index 20ae332..73c593a 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -217,7 +217,7 @@ variants: image_size_stg = 1G remove_image_stg = yes force_create_image_stg = yes -pci_test_cmd = "yes | mke2fs `fdisk -l 2>&1 | awk '/\/dev\/[sv]d[a-z] doesn/ {print $2}'`" +pci_test_cmd = "yes | mke2fs `fdisk -l 2>&1 | awk '/\/dev\/[hsv]d[a-z] doesn/ {print $2}'`" wait_secs_for_hook_up = 3 kill_vm_on_error = yes variants: -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] KVM test: Major control file cleanup
As pointed out before, the KVM reference control files could use a little clean up. This patch implements severe cleanup of the main control file by: * Refactoring the code present there, moving it to the kvm_utils.py library * Treat the build test exactly the same way as other tests, moving the config stuff that used to be in the control file realm out to its own configuration file, for the sake of consistency. This way the control file becomes way shorter, fairly well organized, and we have a consistent configuration schema across the board, based on configuration files. If people are OK with this change, final patch will change the control.parallel file as well. 2nd try: Implemented pretty much all Michael's suggestions for this patchset. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/control | 219 +++-- client/tests/kvm/kvm_utils.py | 51 ++ 2 files changed, 86 insertions(+), 184 deletions(-) diff --git a/client/tests/kvm/control b/client/tests/kvm/control index a526cc0..163286e 100644 --- a/client/tests/kvm/control +++ b/client/tests/kvm/control @@ -6,7 +6,7 @@ dh...@redhat.com (David Huff) aerom...@redhat.com (Alexey Eromenko) mbu...@redhat.com (Mike Burns) """ -TIME = 'SHORT' +TIME = 'MEDIUM' NAME = 'KVM test' TEST_TYPE = 'client' TEST_CLASS = 'Virtualization' @@ -20,194 +20,45 @@ KVM (both kernelspace and userspace) code. 
For online docs, please refer to http://www.linux-kvm.org/page/KVM-Autotest """ +import sys, os, logging +# Add the KVM tests dir to the python path +kvm_test_dir = os.path.join(os.environ['AUTODIR'],'tests/kvm') +sys.path.append(kvm_test_dir) +# Now we can import modules inside the KVM tests dir +import kvm_utils, kvm_config -import sys, os - -#- # set English environment (command output might be localized, need to be safe) -#- os.environ['LANG'] = 'en_US.UTF-8' -#- -# Enable modules import from current directory (tests/kvm) -#- -pwd = os.path.join(os.environ['AUTODIR'],'tests/kvm') -sys.path.append(pwd) - -# -# create required symlinks -# -# When dispatching tests from autotest-server the links we need do not exist on -# the host (the client). The following lines create those symlinks. Change -# 'rootdir' here and/or mount appropriate directories in it. -# -# When dispatching tests on local host (client mode) one can either setup kvm -# links, or same as server mode use rootdir and set all appropriate links and -# mount-points there. For example, guest installation tests need to know where -# to find the iso-files. -# -# We create the links only if not already exist, so if one already set up the -# links for client/local run we do not touch the links. 
-rootdir='/tmp/kvm_autotest_root' -iso=os.path.join(rootdir, 'iso') -images=os.path.join(rootdir, 'images') -qemu=os.path.join(rootdir, 'qemu') -qemu_img=os.path.join(rootdir, 'qemu-img') - - -def link_if_not_exist(ldir, target, link_name): -t = target -l = os.path.join(ldir, link_name) -if not os.path.exists(l): -os.system('ln -s %s %s' % (t, l)) - -# Create links only if not already exist -link_if_not_exist(pwd, '../../', 'autotest') -link_if_not_exist(pwd, iso, 'isos') -link_if_not_exist(pwd, images, 'images') -link_if_not_exist(pwd, qemu, 'qemu') -link_if_not_exist(pwd, qemu_img, 'qemu-img') - -# -# Params that will be passed to the KVM install/build test -# -params = { -"name": "build", -"shortname": "build", -"type": "build", -#"mode": "release", -#"mode": "snapshot", -#"mode": "localtar", -#"mode": "localsrc", -#"mode": "git", -"mode": "noinstall", -#"mode": "koji", - -## Are we going to load modules built by this test? -## Defaults to 'yes', so if you are going to provide only userspace code to -## be built by this test, please set load_modules to 'no', and make sure -## the kvm and kvm-[vendor] module is already loaded by the time you start -## it. -#"load_modules": "no", - -## Install from a kvm release ("mode": "release"). You can optionally -## specify a release tag. If you omit it, the test will get the latest -## release tag available. -#"release_tag": '84', -#"release_dir": 'http://downloads.sourceforge.net/project/kvm/', -# This is the place that contains the sourceforge project list of files -#"release_listing": 'http://sourceforge.net/projects/kvm/files/', - -## Install from a kvm snapshot location ("mode": "snapshot"). You can -## optionally specify a snapshot date. If you omit it, the
[PATCH 3/5] KVM test: Verify paths to cdrom and qemu on kvm_preprocessing
If paths to CD images and qemu binaries are not correctly configured, the tests will fail, sometimes giving to unexperienced users little clue about what is actually going on. So make sure we verify: * ISO paths * qemu binary paths Inside kvm_preprocessing code, and give clear indications if something goes wrong, asking the user to fix the configuration problem. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/kvm_preprocessing.py | 48 + 1 files changed, 48 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/kvm_preprocessing.py b/client/tests/kvm/kvm_preprocessing.py index 5bae2bd..85a2d9c 100644 --- a/client/tests/kvm/kvm_preprocessing.py +++ b/client/tests/kvm/kvm_preprocessing.py @@ -187,6 +187,29 @@ def preprocess(test, params, env): @param params: A dict containing all VM and image parameters. @param env: The environment (a dict-like object). """ +# Verify if the: +# * CD locations +# * qemu and qemu-img binaries +# are valid paths +needed_paths = [[params.get("cdrom", ""), + os.path.join(test.bindir, 'isos')], +[params.get("qemu_binary", ""), test.bindir], +[params.get("qemu_img_binary", ""), test.bindir]] + +missing_paths = [] +for needed_path, root_dir in needed_paths: +# If the test doesn't set one of the parameters, +# just don't check for it. 
+if needed_path: +needed_path = kvm_utils.get_path(root_dir, needed_path) +if not _is_path_present(needed_path): +missing_paths.append(needed_path) + +if missing_paths: +raise error.TestError("The following needed paths are missing " + "or are broken symbolic links: %s" % + missing_paths) + # Start tcpdump if it isn't already running if not env.has_key("address_cache"): env["address_cache"] = {} @@ -343,3 +366,28 @@ def _update_address_cache(address_cache, line): mac_address, address_cache.get("last_seen")) address_cache[mac_address] = address_cache.get("last_seen") del address_cache["last_seen"] + + +def _is_path_present(path): +""" +Verify whether a given path to a file is present (follows symlinks). + +@param path: Path to the file. +@return: True when the file is present, False when it's not. +""" +exists = True + +if os.path.islink(path): +source = os.path.abspath(os.readlink(path)) +if not os.path.isfile(source): +logging.warning("File %s, needed for this test, " +"is a broken symbolic link. Please fix your " +"test configuration." % path) +exists = False +elif not os.path.isfile(path): +logging.warning("File %s, needed for this test, does not exist. " +"Please fix your test configuration." % path) +exists = False + +return exists + -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] KVM test: Unattended script: Make qemu_img_bin come from test params
Instead of hard coding the path to qemu-img on the unattended_install script, let's pick it up from the test parameters. Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/scripts/unattended.py |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/client/tests/kvm/scripts/unattended.py b/client/tests/kvm/scripts/unattended.py index 2667649..562d317 100755 --- a/client/tests/kvm/scripts/unattended.py +++ b/client/tests/kvm/scripts/unattended.py @@ -53,7 +53,7 @@ class UnattendedInstall(object): cdrom_iso = os.environ['KVM_TEST_cdrom'] self.unattended_file = os.environ['KVM_TEST_unattended_file'] -self.qemu_img_bin = os.path.join(kvm_test_dir, 'qemu-img') +self.qemu_img_bin = os.environ['KVM_TEST_qemu_img_binary'] self.cdrom_iso = os.path.join(kvm_test_dir, cdrom_iso) self.floppy_mount = tempfile.mkdtemp(prefix='floppy_', dir='/tmp') self.cdrom_mount = tempfile.mkdtemp(prefix='cdrom_', dir='/tmp') -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] qemu-kvm: fix ia64 build breakage
Hi Xiantao, On 12.08.2009, at 06:03, Zhang, Xiantao wrote: > From 2d3d6cf55f7fecd9a9fd7c764e43b1ee56c7eebb Mon Sep 17 00:00:00 2001 > From: Xiantao Zhang > Date: Wed, 12 Aug 2009 11:39:33 +0800 > Subject: [PATCH] qemu-kvm: fix ia64 build breakage > > fix some configure issues. Do you have any plans to get the IA64 target building for 0.12 again? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: KVM: MMU: remove prefault from invlpg handler
On Sat, Dec 05, 2009 at 10:15:44PM +0200, Avi Kivity wrote: > On 12/05/2009 09:42 PM, Marcelo Tosatti wrote: >> >>> I don't think the OS has "other mechanisms", though - the processor can >>> speculate the tlb so that would be an OS bug. >> >> Can it? I figured it relied on the fact that no access (therefore no TLB >> entry instantiation) meant there is no need to invlpg (since there is >> nothing in the TLB to invalidate), before updating a particular pte. >> >> The documentation states that invlpg invalidates any entries for the >> linear address. >> > > 4.10.1.3 says, "The processor may cache translations required for > prefetches and for accesses that are a result of speculative execution > that would never actually occur in the executed code path.", so there is > no way for the OS to ensure no access has occurred. If you change a > present pte, you must execute invlpg afterwards to ensure speculation > hasn't instantiated the old pte. > > >>> It looks like a race: >>> Signed-off-by: Marcelo Tosatti diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a601713..58a0f1e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -455,8 +455,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -470,10 +468,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL&& is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL&& is_large_pte(*sptep { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn<< PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -492,18 +486,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); 
spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa,&gpte, -sizeof(pt_element_t))) - return; >>> >>> >>> Here, another vcpu updates the gpte and issues a new invlpg. >>> >>> - if (is_present_gpte(gpte)&& (gpte& PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, -sizeof(pt_element_t), 0); - } >>> >>> >>> And here we undo the correct invlpg with the outdated gpte. >>> >>> Looks like we considered this, since kvm_read_guest_atomic() is only >>> needed if inside the spinlock, but some other change moved the >>> spin_unlock() upwards. Will investigate history. >> >> Isnt it the OS responsability to serialize pte updates + invlpg between >> CPUs? > > It is. Do you still have a trace of the error? Maybe we can understand > what the guest thought it was doing. BAD_POOL_HEADER (19) The pool is already corrupt at the time of the current request. This may or may not be due to the caller. The internal pool links must be walked to figure out a possible cause of the problem, and then special pool applied to the suspect tags or the driver verifier to a suspect driver. Arguments: Arg1: 0021, the data following the pool block being freed is corrupt. Typically this means the consumer (call stack ) has overrun the block. Arg2: 95424000, The pool pointer being freed. Arg3: 1010, The number of bytes allocated for the pool block. Arg4: , The corrupted value found following the pool block. The BAD_POOL_HEADER BSOD happens at address 0xF8A000DDD000 (complaining it contains "00", Arg4). 
Walking the pagetables takes to 0x18996 as the pte page: (qemu) xp 0x18996ee8 (vaddr 0xF8A000DDD000) 18996ee8: 0x153c9963 (qemu) xp 0x18996ef0 (vaddr 0xF8A000DDE000) 18996ef0: 0x1528a963 qemu-system-x86-13667 [007] 425860.260987: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15f11 invlpg=1 qemu-system-x86-13670 [004] 425860.264977: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15253 invlpg=1 qemu-system-x86-13670 [004] 425860.265039: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 15f15 invlpg=1 qemu-system-x86-13670 [004] 425860.266591: kvm_mmu_pte_write: sp->gfn 18996 (offset=ef0) gfn 146f3 invlpg=1 qemu-system-x86-13670 [004] 425860.268128: kvm_mm
Re: A few KVM security questions
Muli Ben-Yehuda wrote: On Mon, Dec 07, 2009 at 11:38:52AM -0600, Anthony Liguori wrote: I'm skeptical that VT-d in its current form provides protection against a malicious guest. The first problem is interrupt delivery. I don't think any hypervisor has really put much thought into mitigating interrupt storms as a DoS. I think there are a number of nasty things that can be done here. Seems to me that detecting an interrupt storm and shutting the offending domain and device off is fairly easy for MSI and MSI-X interrupts, and not-interesting for legacy INTx interrupts. I don't know that any hypervisor actually implements it, though. Even if you assume that there aren't flaws in VT-d wrt malicious guests, we have generations of hardware that have not been designed to be robust against malicious operating systems. There are almost certainly untold numbers of exploitable hardware bugs that can be used to do all sorts of terrible things to the physical system. To the device? Undoubtedly. To the host? I'm not so sure. But in the context of SR-IOV, impacting the device may result in disrupting (and potentially exploiting) other domains. And I'm waiting for the "malicious guest sets server on fire" CVE :-) I'm convinced there will be at least one. VT-d protects against DMA access, but there's still plenty of things a malicious PCI device can do to harm the physical system. I'm sure you could easily program a PCI device to flood the bus which effectively mounts a DoS against other domains. But is there any way the device could do this and also evade detection of evade being taken off-line by the host, after first killing its controlling VM? Thing is, the bus is shared by the host too. So if the guest is able to bring all IO devices on the system to a halt, an administrator certainly couldn't connect remotely to take corrective action. I think all of this could potentially be detected and handled but I assume there's years of research here before that's a reality. 
There is no mechanism to arbitrate this today. It's really a dramatically different model from a security perspective. I think we need to differentiate between assigning full (legacy) devices, and assigning an SRIOV VF. In the latter---more interesting---case, the host remains in control of the overall device, so shutting off a mis-behaving VF should be simple. SR-IOV is worse IMHO because now there are multiple guests that can be impacted by a hardware exploit. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
virtio-net offloads not enabled with latest qemu-kvm
With the latest upstream qemu-kvm git tree, all the offloads are disabled on virtio-net. peer_has_vnet_hdr(n) in virtio_net_get_features() is failing because n->vc->peer is NULL. Could not figure out yet why peer field is not initialized. Do i need any new options to be specified with qemu command? qemu-system-x86_64 -m 1024 -kernel /boot/vmlinuz-2.6.32-guest -append 'root=/dev/vda1 console=tty0 console=ttyS0,115200' -initrd /boot/initrd-2.6.32-guest.img -drive file=/kvm_images/fedora10-2-vm,if=virtio,index=0 -net nic,macaddr=54:52:00:35:e3:74,model=virtio -net tap,ifname=vnet1,script=no,downscript=no Works fine with qemu-kvm-0.11.0 and i see checksum/tso/gso are enabled on guest virtio-net device. Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 07:33 PM, Joanna Rutkowska wrote: AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe it changed while I wasn't watching, though. Negative. VT-d can be used to contain PV DomUs as well. We actually verified it. Ah, good for them. It can use read() and write() (and shared memory) to communicate, just like Xen stub domains. Well, but the read() and write() syscalls, on a system like Linux, it's a gate to *lots* of code. These are very powerful system calls. But you control all the file descriptors. A minimal system would just consist of a pair of eventfd fds for signalling and shared memory (the Xen equivalent is event channels and grant tables). It's a lot of surgery, but it can be done. And then you have the code with whom this qemu communicates (e.g. the network stack). You said we could somehow use IPC to delegate it to some VM (that would have VT-d assigned NIC). But then this VM would need to use qemu again (of course this time not for net emulation). Looks non-trivial. It doesn't really need to be a VM. Once the seccomp constrained qemu processes the guest actions, the result is a fairly simple event stream. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM test: Make sure resources_check use nic_mode=tap
nic_mode=tap is required for making physical_resources to work Signed-off-by: Lucas Meneghel Rodrigues --- client/tests/kvm/kvm_tests.cfg.sample |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index 20ae332..e08bca4 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -266,6 +266,7 @@ variants: - physical_resources_check: install setup unattended_install type = physical_resources_check +nic_mode = tap catch_uuid_cmd = dmidecode | awk -F: '/UUID/ {print $2}' # NICs -- 1.6.5.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Joanna Rutkowska wrote: >> Anthony Liguori wrote: >> >>> Avi Kivity wrote: >>> No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. >>> NB, unlike Xen, we can (and do) run qemu as non-root. Things like >>> RHEV-H and oVirt constrain the qemu process with SELinux. >>> >>> >> >> On Xen you can get rid of the qemu entirely, if you run only PV domains. >> >> >>> Also, you can use qemu to provide the backends to a Xen PV guest (see -M >>> xenpv). The effect is that you are moving that privileged code from the >>> kernel (netback/blkback) to userspace (qemu -M xenpv). >>> >>> In general, KVM tends to keep code in userspace unless absolutely >>> necessary. That's a fundamental difference from Xen which tends to do >>> the opposite. >>> >>> >> >> But the difference is that in case of Xen one can *easily* move the >> backends to small unprivileged VMs. In that case it doesn't matter the >> code is in kernel mode, it's still only in an unprivileged domain. >> > > Right, in KVM, Linux == hypervisor. A process is our "unprivileged > domain". Putting an unprivileged domain within an unprivileged domain > is probably not helpful from a security perspective since the exposure > surface is identical. > >> Sandboxing a process in a monolithic OS, like Linux, is generally >> considered unfeasible, for anything more complex than a hello world >> program. The process <-> kernel interface seem to be just too fat. See >> e.g. the recent Linux kernel overflows by Spender. >> > > That's the point of mandatory access control. Of course, you need the > right policy and Spender highlighted an issue with the standard RHEL > SELinux policy, but that should be addressed now upstream. > >> Also, SELinux seems to me like a step into the wrong direction. It not >> only adds complexity to the already-too-complex kernel, but requires >> complex configuration. See e.g. 
this paper[1] for a nice example of how >> to escape SE-sandboxed qemu on FC8 due to SELinux policy >> misconfiguration. >> >> When some people tried to add SELinux-like-thing to Xen hypervisor, it >> only resulted in an exploitable heap overflow in Xen [2]. >> > > It's certainly fair to argue the merits of SELinux as a mandatory access > control mechanism. > > Again though, that's the point of MLS. Our first line of defense is > qemu. Our second line of defense is traditional Posix direct access > control. Our third line of defense is namespace isolation (ala lxc). > Our fourth line of defense is mandatory access control (ala SELinux and > AppArmor). > > If you take a somewhat standard deployment like RHEV-H, an awful lot of > things have to go wrong before you can successfully exploit the system. > And 5.4 doesn't even implement all of what's possible. If you're really > looking to harden, you can be much more aggressive about privileges and > namespace isolation. > I think this ultimately comes down to the question: is the built-from-scratch minimal PV interface (as in Xen) more secure than the Linux's fat-but-sandboxed interface? joanna. signature.asc Description: OpenPGP digital signature
Test failures during git daily testing
Hi Folks: Today pretty much all install tests for kvm and qemu upstream git failed. the vm screen says something along the lines: Starting SeaBIOS [version-string] No bootable device. Screenshot attached. The command line the test used: /usr/local/autotest/tests/kvm/qemu -name 'vm1' -monitor unix:/tmp/monitor-20091207-120625-tyjI,server,nowait -drive file=/usr/local/autotest/tests/kvm/images/fc11-32.qcow2,if=ide -net nic,vlan=0 -net user,vlan=0 -m 512 -smp 1 -cdrom /usr/local/autotest/tests/kvm/isos/linux/Fedora-11-i386-DVD.iso -fda /usr/local/autotest/tests/kvm/images/floppy.img -tftp /usr/local/autotest/tests/kvm/images/tftpboot -boot d -bootp /pxelinux.0 -boot n -redir tcp:5000::22 -vnc :0 There's a concern that the command that was used might no longer be valid. If that's the case, please advise. Lucas <>
Re: A few KVM security questions
On Mon, Dec 07, 2009 at 06:09:55PM +0100, Joanna Rutkowska wrote: > > Also, SELinux seems to me like a step into the wrong direction. It not > only adds complexity to the already-too-complex kernel, but requires > complex configuration. See e.g. this paper[1] for a nice example of how > to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. Things have changed a lot since the time that the Xen SELinux policy was written. The Xen policy was always a tradeoff between usability & security since the XenD management tools were playing no part in the configuration, leaving it up to the administrator. With KVM & SELinux, the management tools play an active part in configuration, removing this burden from the administrator. Each VM runs under a SELinux context with a dedicated MLS category, and the resources the VM is assigned have their labelling set to match. The guest policy only allows it access to resources with a matching MLS level, so it cannot gain access to anything the administrator has not explicitly granted in the VM's configuration. This is actually simpler for administrators, since they no longer need to manage labelling themselves, while offering greater protection between VMs, which was also not possible with the old Xen policy. Regards, Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Joanna Rutkowska wrote: >> Avi Kivity wrote: >> >>> On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: >>> > Also, you can use qemu to provide the backends to a Xen PV guest > (see -M > xenpv). The effect is that you are moving that privileged code > from the > kernel (netback/blkback) to userspace (qemu -M xenpv). > > In general, KVM tends to keep code in userspace unless absolutely > necessary. That's a fundamental difference from Xen which tends to do > the opposite. > > But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. >>> They're not really unprivileged, one can easily program the dma >>> controller of their assigned pci card to read and write arbitrary host >>> memory. >>> >>> >> >> That's not true if you use VT-d. >> > > I'm skeptical that VT-d in its current form provides protection against > a malicious guest. The first problem is interrupt delivery. I don't > think any hypervisor has really put much thought into mitigating > interrupt storms as a DoS. I think there are a number of nasty things > that can be done here. > Intel VT-d v1 doesn't support interrupt remapping, so I'm sure you're right here. But a DoS attack is a different thing than a system subversion (think malware) attack. Of course which one you fear more would depend on your threat model. > Even if you assume that there aren't flaws in VT-d wrt malicious guests, > we have generations of hardware that have not been designed to be robust > against malicious operating systems. There are almost certainly untold > numbers of exploitable hardware bugs that can be used to do all sorts of > terrible things to the physical system. > Perhaps, although so far nobody presented a software-only VT-d escape attack. I think it's reasonable to assume some maniacs would discover one or two in the coming years. 
Still, probably order of magnitude less likely than a Linux kernel overflow. > VT-d protects against DMA access, but there's still plenty of things a > malicious PCI device can do to harm the physical system. I'm sure you > could easily program a PCI device to flood the bus which effectively > mounts a DoS against other domains. There is no mechanism to arbitrate > this today. It's really a dramatically different model from a security > perspective. > Agree, there are lots of DoS possibilities. It's just that for me, personally, they are not in the threat model. joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Avi Kivity wrote: On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. That's not true if you use VT-d. I'm skeptical that VT-d in its current form provides protection against a malicious guest. The first problem is interrupt delivery. I don't think any hypervisor has really put much thought into mitigating interrupt storms as a DoS. I think there are a number of nasty things that can be done here. Even if you assume that there aren't flaws in VT-d wrt malicious guests, we have generations of hardware that have not been designed to be robust against malicious operating systems. There are almost certainly untold numbers of exploitable hardware bugs that can be used to do all sorts of terrible things to the physical system. VT-d protects against DMA access, but there's still plenty of things a malicious PCI device can do to harm the physical system. I'm sure you could easily program a PCI device to flood the bus which effectively mounts a DoS against other domains. There is no mechanism to arbitrate this today. It's really a dramatically different model from a security perspective. 
Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 07:15 PM, Joanna Rutkowska wrote: But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. >>> They're not really unprivileged, one can easily program the dma >>> controller of their assigned pci card to read and write arbitrary host >>> memory. >>> >>> >> That's not true if you use VT-d. >> > > AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe > it changed while I wasn't watching, though. > Negative. VT-d can be used to contain PV DomUs as well. We actually verified it. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. >>> What about seccomp? You can easily simplify qemu to just a bunch of >>> calculations served over a pipe. >>> >>> >> But the qemu must somehow communicate with the external world too, no? >> You said you provide e.g. net backend via the qemu process... >> > > It can use read() and write() (and shared memory) to communicate, just > like Xen stub domains. > Well, but the read() and write() syscalls, on a system like Linux, it's a gate to *lots* of code. These are very powerful system calls. > It's a lot of surgery, but it can be done. > And then you have the code with whom this qemu communicates (e.g. the network stack). You said we could somehow use IPC to delegate it to some VM (that would have VT-d assigned NIC). But then this VM would need to use qemu again (of course this time not for net emulation). Looks non-trivial. joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Anthony Liguori wrote: Avi Kivity wrote: No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. NB, unlike Xen, we can (and do) run qemu as non-root. Things like RHEV-H and oVirt constrain the qemu process with SELinux. On Xen you can get rid of the qemu entirely, if you run only PV domains. Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. Right, in KVM, Linux == hypervisor. A process is our "unprivileged domain". Putting an unprivileged domain within an unprivileged domain is probably not helpful from a security perspective since the exposure surface is identical. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process <-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. That's the point of mandatory access control. Of course, you need the right policy and Spender highlighted an issue with the standard RHEL SELinux policy, but that should be addressed now upstream. Also, SELinux seems to me like a step into the wrong direction. It not only adds complexity to the already-too-complex kernel, but requires complex configuration. See e.g. this paper[1] for a nice example of how to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. 
When some people tried to add SELinux-like-thing to Xen hypervisor, it only resulted in an exploitable heap overflow in Xen [2]. It's certainly fair to argue the merits of SELinux as a mandatory access control mechanism. Again though, that's the point of MLS. Our first line of defense is qemu. Our second line of defense is traditional Posix direct access control. Our third line of defense is namespace isolation (ala lxc). Our fourth line of defense is mandatory access control (ala SELinux and AppArmor). If you take a somewhat standard deployment like RHEV-H, an awful lot of things have to go wrong before you can successfully exploit the system. And 5.4 doesn't even implement all of what's possible. If you're really looking to harden, you can be much more aggressive about privileges and namespace isolation. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 07:15 PM, Joanna Rutkowska wrote: But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. That's not true if you use VT-d. AFAIK VT-d is only supported in Xen for fully virtualized guests. Maybe it changed while I wasn't watching, though. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. What about seccomp? You can easily simplify qemu to just a bunch of calculations served over a pipe. But the qemu must somehow communicate with the external world too, no? You said you provide e.g. net backend via the qemu process... It can use read() and write() (and shared memory) to communicate, just like Xen stub domains. It's a lot of surgery, but it can be done. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: >> >>> Also, you can use qemu to provide the backends to a Xen PV guest (see -M >>> xenpv). The effect is that you are moving that privileged code from the >>> kernel (netback/blkback) to userspace (qemu -M xenpv). >>> >>> In general, KVM tends to keep code in userspace unless absolutely >>> necessary. That's a fundamental difference from Xen which tends to do >>> the opposite. >>> >>> >> But the difference is that in case of Xen one can *easily* move the >> backends to small unprivileged VMs. In that case it doesn't matter the >> code is in kernel mode, it's still only in an unprivileged domain. >> >> > > They're not really unprivileged, one can easily program the dma > controller of their assigned pci card to read and write arbitrary host > memory. > That's not true if you use VT-d. >> Sandboxing a process in a monolithic OS, like Linux, is generally >> considered unfeasible, for anything more complex than a hello world >> program. The process<-> kernel interface seem to be just too fat. See >> e.g. the recent Linux kernel overflows by Spender. >> > > What about seccomp? You can easily simplify qemu to just a bunch of > calculations served over a pipe. > But the qemu must somehow communicate with the external world too, no? You said you provide e.g. net backend via the qemu process... joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 07:09 PM, Joanna Rutkowska wrote: Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. They're not really unprivileged, one can easily program the dma controller of their assigned pci card to read and write arbitrary host memory. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process<-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. What about seccomp? You can easily simplify qemu to just a bunch of calculations served over a pipe. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Anthony Liguori wrote: > Avi Kivity wrote: >> No. Paravirtualization just augments the standard hardware interface, >> it doesn't replace it as in Xen. > > NB, unlike Xen, we can (and do) run qemu as non-root. Things like > RHEV-H and oVirt constrain the qemu process with SELinux. > On Xen you can get rid of the qemu entirely, if you run only PV domains. > Also, you can use qemu to provide the backends to a Xen PV guest (see -M > xenpv). The effect is that you are moving that privileged code from the > kernel (netback/blkback) to userspace (qemu -M xenpv). > > In general, KVM tends to keep code in userspace unless absolutely > necessary. That's a fundamental difference from Xen which tends to do > the opposite. > But the difference is that in case of Xen one can *easily* move the backends to small unprivileged VMs. In that case it doesn't matter the code is in kernel mode, it's still only in an unprivileged domain. Sandboxing a process in a monolithic OS, like Linux, is generally considered unfeasible, for anything more complex than a hello world program. The process <-> kernel interface seem to be just too fat. See e.g. the recent Linux kernel overflows by Spender. Also, SELinux seems to me like a step into the wrong direction. It not only adds complexity to the already-too-complex kernel, but requires complex configuration. See e.g. this paper[1] for a nice example of how to escape SE-sandboxed qemu on FC8 due to SELinux policy misconfiguration. When some people tried to add SELinux-like-thing to Xen hypervisor, it only resulted in an exploitable heap overflow in Xen [2]. [1] http://invisiblethingslab.com/resources/misc08/xenfb-adventures-10.pdf [2] http://invisiblethingslab.com/resources/bh08/part2-full.pdf joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
Joanna Rutkowska wrote: Avi Kivity wrote: On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? It really offers no advantage. The security assumption should be that a guest can break into qemu. If a guest can break out of qemu, putting it in another qemu means that we still need to assume it can break out of that qemu. The host should treat the qemu process as hostile and constrain it by using things like -runas, -chroot, SELinux, and containers. This is what most production systems do today. libvirt certainly takes this approach. That's not to say that we know for sure that a guest can break into qemu, but designing around that assumption gives us MLS. Regards, Anthony Liguori joanna. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. NB, unlike Xen, we can (and do) run qemu as non-root. Things like RHEV-H and oVirt constrain the qemu process with SELinux. Also, you can use qemu to provide the backends to a Xen PV guest (see -M xenpv). The effect is that you are moving that privileged code from the kernel (netback/blkback) to userspace (qemu -M xenpv). In general, KVM tends to keep code in userspace unless absolutely necessary. That's a fundamental difference from Xen which tends to do the opposite. Regards, Anthony Liguori -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 17:09, schrieb Jan Kiszka: > Kevin Wolf wrote: >> In qcow_aio_write_cb there isn't much happening between these calls. The >> only thing that could somehow become dangerous is the >> qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. > > If m->nb_clusters is zero, the entry won't be removed from the list. And > if something corrupted nb_clusters so that it became 0 although it's > still enqueued, we would see the deadly loop I faced, right? > Unfortunately, any arbitrary memory corruption that generates such zeros > can cause this... Right, this looks like another way to get into that endless loop. I don't think it's very likely the cause, but who knows. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Kevin Wolf wrote: > Am 07.12.2009 15:16, schrieb Jan Kiszka: >>> Likely not. What I did was nothing special, and I did not notice such a >>> crash in the last months. >> And now it happened again (qemu-kvm head, during kernel installation >> from network onto local qcow2-disk). Any clever idea how to proceed with >> this? > > I still haven't seen this and I still have no theory on what could be > happening here. I'm just trying to write down what I think must happen > to get into this situation. Maybe you can point at something I'm missing > or maybe it helps you to have a sudden inspiration. > > The crash happens because we have a loop in the s->cluster_allocs list. > A loop can only be created by inserting an object twice. The only insert > to this list happens in qcow2_alloc_cluster_offset (though an earlier > call than that of the stack trace). > > There is only one relevant caller of this function, qcow_aio_write_cb. > Part of it is a call to run_dependent_requests which removes the request > from s->cluster_allocs. So after the QLIST_REMOVE in > run_dependent_requests the request can't be contained in the list, but > at the call of qcow2_alloc_cluster_offset it must be contained again. It > must be added somewhere in between these two calls. > > In qcow_aio_write_cb there isn't much happening between these calls. The > only thing that could somehow become dangerous is the > qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. If m->nb_clusters is zero, the entry won't be removed from the list. And if something corrupted nb_clusters so that it became 0 although it's still enqueued, we would see the deadly loop I faced, right? Unfortunately, any arbitrary memory corruption that generates such zeros can cause this... 
Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
On 12/07/2009 05:32 PM, Jiaqing Du wrote: Hi Avi, I did not get your point. But if we want to multiplex some of the MSRs across the VMM and the guest(s), it would be handy if the hardware provides this feature: save host's version and load guest's version. Of course, we can do this manually. I'm just wondering why this feature is missing. Well, you'll have to ask the designers of the feature. If it can be done manually, why add a feature in hardware? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
Hi Avi, I did not get your point. But if we want to multiplex some of the MSRs across the VMM and the guest(s), it would be handy if the hardware provides this feature: save host's version and load guest's version. Of course, we can do this manually. I'm just wondering why this feature is missing. Thanks, Jiaqing 2009/12/7 Avi Kivity : > On 12/07/2009 05:07 PM, Jiaqing Du wrote: >> >> Hi List, >> >> My question is about VM-Exit& VM-Entry controls for MSRs on Intel's >> processors. >> >> For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded >> on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs >> to be loaded on VM entries. Why does not the processor have the >> feature that stores MSRs before loading new ones for VM entries? >> > > Presumably the host knows what values are in those MSRs, so it doesn't need > to store them. > > -- > error compiling committee.c: too many arguments to function > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Kevin Wolf wrote: > Am 07.12.2009 15:50, schrieb Jan Kiszka: >> Jan Kiszka wrote: >>> And now it happened again (qemu-kvm head, during kernel installation >>> from network onto local qcow2-disk). Any clever idea how to proceed with >>> this? >>> >>> I could try to run the step in a loop, hopefully retriggering it once in >>> a (likely longer) while. But then we need some good instrumentation first. >>> >> Maybe I'm seeing ghosts, and I don't even have a minimal clue about what >> goes on in the code, but this looks fishy: >> >> preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a >> stack variable. It seems that qcow2_alloc_cluster_offset() may insert >> this structure into cluster_allocs and leave it there. So we corrupt the >> queue as soon as preallocate() returns, no? > > preallocate() is about metadata preallocation during image creation. It > is only ever run by qemu-img. Apart from that it calls > run_dependent_requests() which removes the request from the list again. OK, I see - was far too easy anyway. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: MSRs load/store
On 12/07/2009 05:07 PM, Jiaqing Du wrote: Hi List, My question is about VM-Exit& VM-Entry controls for MSRs on Intel's processors. For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs to be loaded on VM entries. Why does not the processor have the feature that stores MSRs before loading new ones for VM entries? Presumably the host knows what values are in those MSRs, so it doesn't need to store them. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
MSRs load/store
Hi List, My question is about VM-Exit & VM-Entry controls for MSRs on Intel's processors. For VM-Exit, a VMM can specify lists of MSRs to be stored and loaded on VM exits. But for VM-Entry, a VMM can only specify a list of MSRs to be loaded on VM entries. Why does not the processor have the feature that stores MSRs before loading new ones for VM entries? Thanks, Jiaqing -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 15:50, schrieb Jan Kiszka: > Jan Kiszka wrote: >> And now it happened again (qemu-kvm head, during kernel installation >> from network onto local qcow2-disk). Any clever idea how to proceed with >> this? >> >> I could try to run the step in a loop, hopefully retriggering it once in >> a (likely longer) while. But then we need some good instrumentation first. >> > > Maybe I'm seeing ghosts, and I don't even have a minimal clue about what > goes on in the code, but this looks fishy: > > preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a > stack variable. It seems that qcow2_alloc_cluster_offset() may insert > this structure into cluster_allocs and leave it there. So we corrupt the > queue as soon as preallocate() returns, no? preallocate() is about metadata preallocation during image creation. It is only ever run by qemu-img. Apart from that it calls run_dependent_requests() which removes the request from the list again. Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
On 12/07/2009 04:50 PM, Jan Kiszka wrote: Maybe I'm seeing ghosts, and I don't even have a minimal clue about what goes on in the code, but this looks fishy: Plenty of ghosts in qcow2, of all those explorers who tried to brave the code. Only Kevin has ever come back. preallocate() invokes qcow2_alloc_cluster_offset() passing&meta, a stack variable. It seems that qcow2_alloc_cluster_offset() may insert this structure into cluster_allocs and leave it there. So we corrupt the queue as soon as preallocate() returns, no? We invoke run_dependent_requests() which should dequeue those &meta again (I think). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Am 07.12.2009 15:16, schrieb Jan Kiszka: >> Likely not. What I did was nothing special, and I did not noticed such a >> crash in the last months. > > And now it happened again (qemu-kvm head, during kernel installation > from network onto local qcow2-disk). Any clever idea how to proceed with > this? I still haven't seen this and I still have no theory on what could be happening here. I'm just trying to write down what I think must happen to get into this situation. Maybe you can point at something I'm missing or maybe it helps you to have a sudden inspiration. The crash happens because we have a loop in the s->cluster_allocs list. A loop can only be created by inserting an object twice. The only insert to this list happens in qcow2_alloc_cluster_offset (though an earlier call than that of the stack trace). There is only one relevant caller of this function, qcow_aio_write_cb. Part of it is a call to run_dependent_requests which removes the request from s->cluster_allocs. So after the QLIST_REMOVE in run_dependent_requests the request can't be contained in the list, but at the call of qcow2_alloc_cluster_offset it must be contained again. It must be added somewhere in between these two calls. In qcow_aio_write_cb there isn't much happening between these calls. The only thing that could somehow become dangerous is the qcow_aio_write_cb(req, 0); for queued requests in run_dependent_requests. > I could try to run the step in a loop, hopefully retriggering it once in > a (likely longer) while. But then we need some good instrumentation first. I can't explain what exactly would be going wrong there, but if my thoughts are right so far, I think that moving this into a Bottom Half would help. So if you can reproduce it in a loop this could be worth a try. I'd certainly prefer to understand the problem first, but thinking about AIO is the perfect way to make your brain hurt... 
Kevin -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Jan Kiszka wrote: > And now it happened again (qemu-kvm head, during kernel installation > from network onto local qcow2-disk). Any clever idea how to proceed with > this? > > I could try to run the step in a loop, hopefully retriggering it once in > a (likely longer) while. But then we need some good instrumentation first. > Maybe I'm seeing ghosts, and I don't even have a minimal clue about what goes on in the code, but this looks fishy: preallocate() invokes qcow2_alloc_cluster_offset() passing &meta, a stack variable. It seems that qcow2_alloc_cluster_offset() may insert this structure into cluster_allocs and leave it there. So we corrupt the queue as soon as preallocate() returns, no? Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Endless loop in qcow2_alloc_cluster_offset
Jan Kiszka wrote: > Kevin Wolf wrote: >> Hi Jan, >> >> Am 19.11.2009 13:19, schrieb Jan Kiszka: >>> (gdb) print ((BDRVQcowState *)bs->opaque)->cluster_allocs.lh_first >>> $5 = (struct QCowL2Meta *) 0xcb3568 >>> (gdb) print *((BDRVQcowState *)bs->opaque)->cluster_allocs.lh_first >>> $6 = {offset = 7417176064, n_start = 0, nb_available = 16, nb_clusters = 0, >>> depends_on = 0xcb3568, dependent_requests = {lh_first = 0x0}, >>> next_in_flight = {le_next = 0xcb3568, le_prev = 0xc4ebd8}} >>> >>> So next == first. >> Oops. Doesn't sound quite right... >> >>> Is something fiddling with cluster_allocs concurrently, e.g. some signal >>> handler? Or what could cause this list corruption? Would it be enough to >>> move to QLIST_FOREACH_SAFE? >> Are there any specific signals you're thinking of? Related to block code > > No, was just blind guessing. > >> I can only think of SIGUSR2 and this one shouldn't call any block driver >> functions directly. You're using aio=threads, I assume? (It's the default) > > Yes, all on defaults. > >> QLIST_FOREACH_SAFE shouldn't make a difference in this place as the loop >> doesn't insert or remove any elements. If the list is corrupted now, I >> think it would be corrupted with QLIST_FOREACH_SAFE as well - at best, >> the endless loop would occur one call later. >> >> The only way I see to get such a loop in a list is to re-insert an >> element that already is part of the list. The only insert is at >> qcow2-cluster.c:777. Remains the question how we came there twice >> without run_dependent_requests() removing the L2Meta from our list first >> - because this is definitely wrong... >> >> Presumably, it's not reproducible? > > Likely not. What I did was nothing special, and I did not noticed such a > crash in the last months. And now it happened again (qemu-kvm head, during kernel installation from network onto local qcow2-disk). Any clever idea how to proceed with this? 
I could try to run the step in a loop, hopefully retriggering it once in a (likely longer) while. But then we need some good instrumentation first. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
On 12/07/2009 04:06 PM, Joanna Rutkowska wrote: Can you point to a document/source file that would list all the possible interfaces between VM and the host? I.e. all the VMX handlers, and all the hypercalls (PV interfaces). arch/x86/kvm/vmx.c is the entry point for all interaction, but it quickly diverges. arch/x86/include/asm/kvm_para.h is the pv interface. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 03:30 PM, Joanna Rutkowska wrote: >> Avi Kivity wrote: >> >> 1) Do you have any support for para-virtualized VMs? >>> Yes, for example, we support paravirtualized timers and mmu for Linux. >>> These are fairly minimal compared to Xen's pv domains. >>> >>> >> Can I run a regular Linux as PV-guest? Specifically, can I get rid of >> qemu totally, assuming I have only PV guests? >> >> > > No. Paravirtualization just augments the standard hardware interface, > it doesn't replace it as in Xen. > Can you point to a document/source file that would list all the possible interfaces between VM and the host? I.e. all the VMX handlers, and all the hypercalls (PV interfaces). joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:55 PM, Joanna Rutkowska wrote: It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? Right, but to exploit this, you'd have to exploit the internal qemu, exploit the kernel, and exploit the external qemu. Well, if the exploit was in some central thing I guess there isn't much value in the nesting. You could alternatively use Xenner to run Xen guests for your qemu. That emulates a lot less. But AFAIK xenner is moving towards qemu as well. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: > On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: >> In particular, is >> it possible to move the qemu from the host to one of the VMs? Perhaps to >> have a separate copy of qemu for each VM? (ala Xen's stub-domains) >> > > It should be fairly easy to place qemu in a guest. You would leave a > simple program on the host to communicate with kvm and pass any data > written by the guest to qemu running in another guest, and feed any > replies back to the guest. > But then you would need to have another qemu (on the host) to support running this "qemu-VM", where we want to put the qemu, right? joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:30 PM, Joanna Rutkowska wrote: Avi Kivity wrote: 1) Do you have any support for para-virtualized VMs? Yes, for example, we support paravirtualized timers and mmu for Linux. These are fairly minimal compared to Xen's pv domains. Can I run a regular Linux as PV-guest? Specifically, can I get rid of qemu totally, assuming I have only PV guests? No. Paravirtualization just augments the standard hardware interface, it doesn't replace it as in Xen. E.g. do you have PV network and disk frontends (PV drivers), that I could use on guests and that do not use qemu at all? We do have PV network and disk frontends, but the backends (devices) are still in qemu. Should be doable by assigning the NIC to a driver domain and bridging it to a virtio driver; then have the driver domain's virtio device talk to the ordinary guests. But bridging would still require to have some networking support (+net backends) on the host (sure, without any real NIC driver, but still), correct? If you were willing to hack a bit, you can use any IPC to pass the packets instead of the networking stack (for example, shared memory + eventfd for signalling). 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Yes, there is a stub driver that does this. Does this stub driver sets DMA protections, so that the device in question cannot access any host memory? That is important, because once you assigned a device to some VM, we should assume the VM might have somehow compromised the device, e.g. reflashed the firmware of the NIC, perhaps. So, it's important to be able to protect the hypervisor from such devices. kvm places assigned devices in an iommu protection domain so it cannot attack the host. Once the guest stops using the device, we reset it. 
If the guest is able to upload a malicious, persistent payload to the device, then when the device is reused whoever uses it will be vulnerable (whether a new guest or the host). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A few KVM security questions
Avi Kivity wrote: >> 1) Do you have any support for para-virtualized VMs? > > Yes, for example, we support paravirtualized timers and mmu for Linux. > These are fairly minimal compared to Xen's pv domains. > Can I run a regular Linux as PV-guest? Specifically, can I get rid of qemu totally, assuming I have only PV guests? E.g. do you have PV network and disk frontends (PV drivers), that I could use on guests and that do not use qemu at all? >> 2) Is it possible to have driver domains in KVM? E.g. I would like to >> assign my NIC to one VM (a "network domain") and then I would like other >> domains to use this network domain for networking. In case of Xen, this >> is done by moving the network backend (which is not qemu BTW) into the >> network domain, and configuring the network frontends in other VMs to >> talk to this network domain's backend, rather then to Dom0's backend (in >> fact you can get rid of all the networking in Dom0). >> > > Should be doable by assigning the NIC to a driver domain and bridging it > to a virtio driver; then have the driver domain's virtio device talk to > the ordinary guests. But bridging would still require to have some networking support (+net backends) on the host (sure, without any real NIC driver, but still), correct? >> 4) Do you have some method of excluding particular PCI devices from >> being initialized by your host Linux? E.g. those devices that are later >> to be assigned to some VMs (via VT-d passthrough)? > > Yes, there is a stub driver that does this. > Does this stub driver sets DMA protections, so that the device in question cannot access any host memory? That is important, because once you assigned a device to some VM, we should assume the VM might have somehow compromised the device, e.g. reflashed the firmware of the NIC, perhaps. So, it's important to be able to protect the hypervisor from such devices. Thanks, joanna. signature.asc Description: OpenPGP digital signature
Re: A few KVM security questions
On 12/07/2009 03:05 PM, Joanna Rutkowska wrote: Hello, I have the following questions regarding the KVM architecture. I looked at the slides available at linux-kvm.org, but didn't find definitive answers. I'm also interested to learn if given feature is or is not planned for the near future. The questions follow: 1) Do you have any support for para-virtualized VMs? Yes, for example, we support paravirtualized timers and mmu for Linux. These are fairly minimal compared to Xen's pv domains. In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) It should be fairly easy to place qemu in a guest. You would leave a simple program on the host to communicate with kvm and pass any data written by the guest to qemu running in another guest, and feed any replies back to the guest. It should also be possible to constrain qemu using SECCOMP. None of this has been attempted to my knowledge. 2) Is it possible to have driver domains in KVM? E.g. I would like to assign my NIC to one VM (a "network domain") and then I would like other domains to use this network domain for networking. In case of Xen, this is done by moving the network backend (which is not qemu BTW) into the network domain, and configuring the network frontends in other VMs to talk to this network domain's backend, rather then to Dom0's backend (in fact you can get rid of all the networking in Dom0). Should be doable by assigning the NIC to a driver domain and bridging it to a virtio driver; then have the driver domain's virtio device talk to the ordinary guests. 3) Do you have any support for TXT-based trusted boot? I guess you indirectly have via tboot. However, how do you deal with VT-d protections? The tboot.gz should normally DMA-protect memory before handing execution over to Linux kernel. But then you need to allow your drivers to work. 
Do you unprotect all the memory for DMA, or do you have some support for selectively unprotect only those regions of memory which are needed by (some) drivers? If the latter, how do you determine which memory should be DMA-unprotected? I know nothing about tboot. 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Yes, there is a stub driver that does this. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
A few KVM security questions
Hello, I have the following questions regarding the KVM architecture. I looked at the slides available at linux-kvm.org, but didn't find definitive answers. I'm also interested to learn if given feature is or is not planned for the near future. The questions follow: 1) Do you have any support for para-virtualized VMs? In particular, is it possible to move the qemu from the host to one of the VMs? Perhaps to have a separate copy of qemu for each VM? (ala Xen's stub-domains) 2) Is it possible to have driver domains in KVM? E.g. I would like to assign my NIC to one VM (a "network domain") and then I would like other domains to use this network domain for networking. In case of Xen, this is done by moving the network backend (which is not qemu BTW) into the network domain, and configuring the network frontends in other VMs to talk to this network domain's backend, rather then to Dom0's backend (in fact you can get rid of all the networking in Dom0). 3) Do you have any support for TXT-based trusted boot? I guess you indirectly have via tboot. However, how do you deal with VT-d protections? The tboot.gz should normally DMA-protect memory before handing execution over to Linux kernel. But then you need to allow your drivers to work. Do you unprotect all the memory for DMA, or do you have some support for selectively unprotect only those regions of memory which are needed by (some) drivers? If the latter, how do you determine which memory should be DMA-unprotected? 4) Do you have some method of excluding particular PCI devices from being initialized by your host Linux? E.g. those devices that are later to be assigned to some VMs (via VT-d passthrough)? Thanks, I would appreciate any answers. Please note I'm not subscribed to the list, so won't get your response if sent only to the list. Regards, joanna. -- Joanna Rutkowska Founder/CEO Invisible Things Lab http://invisiblethingslab.com/ signature.asc Description: OpenPGP digital signature
Re: [Autotest][PATCH 1/2] add hackbench test to kvm autotest
FYI, this was already incorporated to the tree, thanks Sudhir! On Fri, 2009-12-04 at 11:19 +0530, sudhir kumar wrote: > This patch adds the hackbench test for the KVM linux guests. > > Signed-off-by: Sudhir Kumar > > Index: kvm/autotest_control/hackbench.control > === > --- /dev/null > +++ kvm/autotest_control/hackbench.control > @@ -0,0 +1,13 @@ > +AUTHOR = "Sudhir Kumar " > +NAME = "Hackbench" > +TIME = "SHORT" > +TEST_CLASS = "Kernel" > +TEST_CATEGORY = "Benchmark" > +TEST_TYPE = "client" > + > +DOC = """ > +Hackbench is a benchmark which measures the performance, overhead and > +scalability of the Linux scheduler. > + > +""" > +job.run_test('hackbench') > > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/4] KVM: Add accessor for reading cr4 (or some bits of cr4)
Some bits of cr4 can be owned by the guest on vmx, so when we read them, we copy them to the vcpu structure. In preparation for making the set of guest-owned bits dynamic, use helpers to access these bits so we don't need to know where the bit resides. No changes to svm since all bits are host-owned there. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h |1 + arch/x86/kvm/kvm_cache_regs.h | 12 arch/x86/kvm/mmu.h |5 +++-- arch/x86/kvm/vmx.c | 13 - arch/x86/kvm/x86.c | 16 ++-- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da6dee8..e9f4f12 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -272,6 +272,7 @@ struct kvm_vcpu_arch { unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 7bcc5b6..35acc36 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -38,4 +38,16 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 61a1b38..4567d80 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -64,12 +65,12 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PAE; + return 
kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PSE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ef820e..ae95a0c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1612,8 +1612,10 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1658,7 +1660,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1666,7 +1668,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) @@ -2417,6 +2419,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_GUEST_CR4_MASK; tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); @@ -3047,7 +3050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->arch.eff_db[dr] = val; break; case 4 ... 
5: - if (vcpu->arch.cr4 & X86_CR4_DE) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) kvm_queue_exception(vcpu, UD_VECTOR); break; case 6: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd15d7a..4a16337 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -481,7 +481,7 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr
[PATCH 3/4] KVM: VMX: Make guest cr4 mask more conservative
Instead of specifying the bits which we want to trap on, specify the bits which we allow the guest to change transparently. This is safer wrt future changes to cr4. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++ 1 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae95a0c..d34fdd3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -69,8 +69,10 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) +#define KVM_CR4_GUEST_OWNED_BITS \ + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ +| X86_CR4_OSXMMEXCPT) + #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -2418,8 +2420,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_GUEST_CR4_MASK; + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; rdtscll(tsc_this); -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] cr4 optimizations for vmx/ept
When ept is enabled, we aren't particularly interested in cr4.pge, so allow the guest to own it. This improves performance in vmap() intensive loads. Avi Kivity (4): KVM: VMX: Move some cr[04] related constants to vmx.c KVM: Add accessor for reading cr4 (or some bits of cr4) KVM: VMX: Make guest cr4 mask more conservative KVM: VMX: When using ept, allow the guest to own cr4.pge arch/x86/include/asm/kvm_host.h | 14 +- arch/x86/kvm/kvm_cache_regs.h | 12 arch/x86/kvm/mmu.h |5 +++-- arch/x86/kvm/vmx.c | 32 ++-- arch/x86/kvm/x86.c | 16 ++-- 5 files changed, 48 insertions(+), 31 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] KVM: VMX: When using ept, allow the guest to own cr4.pge
We make no use of cr4.pge if ept is enabled, but the guest does (to flush global mappings, as with vmap()), so give the guest ownership of this bit. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d34fdd3..2e47e65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2421,6 +2421,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; + if (enable_ept) + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] KVM: VMX: Move some cr[04] related constants to vmx.c
They have no place in common code. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 - arch/x86/kvm/vmx.c | 13 + 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f865e8..da6dee8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,19 +38,6 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFF00ULL) -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9a0a2cf..5ef820e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -61,6 +61,19 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) +#define KVM_VM_CR0_ALWAYS_ON \ + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_GUEST_CR4_MASK \ + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) 
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap:upper bound on the amount of time between two successive -- 1.6.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Autotest] [PATCH] Add a server-side test - kvm_migration
Resending with proper cc list :( On Mon, Dec 7, 2009 at 2:43 PM, sudhir kumar wrote: > Thanks for initiating the server side implementation of migration. Few > comments below > > On Fri, Dec 4, 2009 at 1:48 PM, Yolkfull Chow wrote: >> This patch will add a server-side test namely kvm_migration. Currently, >> it will use existing KVM client test framework and add a new file >> kvm_migration.py to help judge executing routine: source machine or dest >> machine. >> >> * One thing need to be considered/improved: >> Whether we parse the kvm_tests.cfg on server machine or on client machines? >> If parse it on client machines, we need to fix one problem that adding >> 'start_vm_for_migration' parameter into dict which generated on dest machine. > I think we can not manage with client side parsing without adding too > much complexity. So let us continue parsing on the server side only > for remote migration. Also as the patch does, keep the local migration > under the client also. I do not like adding test variants in > migration_control.srv. Comments below... >> >> So far I choose parsing kvm_tests.cfg on server machine, and then add >> 'start_vm_for_migration' into dict cloned from original test dict for dest >> machine. >> >> * In order to run this test so far, we need to setup NFS for both >> source and dest machines. 
>> >> Signed-off-by: Yolkfull Chow >> --- >> client/tests/kvm/kvm_migration.py | 165 >> >> client/tests/kvm/kvm_test_utils.py | 27 +++--- >> client/tests/kvm/kvm_tests.cfg.sample | 2 + >> client/tests/kvm_migration | 1 + >> server/tests/kvm/migration_control.srv | 137 ++ >> 5 files changed, 320 insertions(+), 12 deletions(-) >> create mode 100644 client/tests/kvm/kvm_migration.py >> create mode 120000 client/tests/kvm_migration >> create mode 100644 server/tests/kvm/migration_control.srv >> >> diff --git a/client/tests/kvm/kvm_migration.py >> b/client/tests/kvm/kvm_migration.py >> new file mode 100644 >> index 000..52cd3cd >> --- /dev/null >> +++ b/client/tests/kvm/kvm_migration.py >> @@ -0,0 +1,165 @@ >> +import sys, os, time, logging, commands, socket >> +from autotest_lib.client.bin import test >> +from autotest_lib.client.common_lib import error >> +import kvm_utils, kvm_preprocessing, common, kvm_vm, kvm_test_utils >> + >> + >> +class kvm_migration(test.test): >> + """ >> + KVM migration test. >> + >> + @copyright: Red Hat 2008-2009 >> + @see: http://www.linux-kvm.org/page/KVM-Autotest/Client_Install >> + (Online doc - Getting started with KVM testing) >> + >> + Migration execution progress: >> + >> + source host dest host >> + -- >> + log into guest >> + -- >> + start socket server >> + >> + wait 30 secs -- wait login_timeout+30 secs--- >> + >> + accept connection connect to socket server,send mig_port >> + -- >> + start migration >> + >> + wait 30 secs -- wait mig_timeout+30 secs- >> + >> + try to log into migrated guest >> + -- >> + >> + """ >> + version = 1 >> + def initialize(self): >> + pass >> + >> + >> + def run_once(self, params): >> + """ >> + Setup remote machine and then execute migration. 
>> + """ >> + # Check whether remote machine is ready >> + dsthost = params.get("dsthost") >> + srchost = params.get("srchost") >> + image_path = os.path.join(self.bindir, "images") >> + >> + rootdir = params.get("rootdir") >> + iso = os.path.join(rootdir, 'iso') >> + images = os.path.join(rootdir, 'images') >> + qemu = os.path.join(rootdir, 'qemu') >> + qemu_img = os.path.join(rootdir, 'qemu-img') >> + >> + def link_if_not_exist(ldir, target, link_name): >> + t = target >> + l = os.path.join(ldir, link_name) >> + if not os.path.exists(l): >> + os.symlink(t,l) >> + link_if_not_exist(self.bindir, '../../', 'autotest') >> + link_if_not_exist(self.bindir, iso, 'isos') >> + link_if_not_exist(self.bindir, images, 'images') >> + link_if_not_exist(self.bindir, qemu, 'qemu') >> + link_if_not_exist(self.bindir, qemu_img, 'qemu-img') >> + >> + # Report the parameters we've received and write them as keyvals >> + logging.debug("Test parameters:") >> + keys = params.keys() >> + keys.sort() >> + for key in keys: >> + logging.debug(" %s = %s", key, params[key]) >> + self.write_test_keyval({key: params[key]}) >> + >> + # Ope
[PATCH] KVM: Fix possible circular locking in kvm_vm_ioctl_assign_device()
One possible order is: KVM_CREATE_IRQCHIP ioctl(took kvm->lock) -> kvm_iobus_register_dev() -> down_write(kvm->slots_lock). The other one is in kvm_vm_ioctl_assign_device(), which take kvm->slots_lock first, then kvm->lock. Observe it due to kernel locking debug warnings. Signed-off-by: Sheng Yang --- virt/kvm/assigned-dev.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index fd9c097..f73de63 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -508,8 +508,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; - down_read(&kvm->slots_lock); mutex_lock(&kvm->lock); + down_read(&kvm->slots_lock); match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, assigned_dev->assigned_dev_id); @@ -573,8 +573,8 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, } out: - mutex_unlock(&kvm->lock); up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); return r; out_list_del: list_del(&match->list); @@ -585,8 +585,8 @@ out_put: pci_dev_put(dev); out_free: kfree(match); - mutex_unlock(&kvm->lock); up_read(&kvm->slots_lock); + mutex_unlock(&kvm->lock); return r; } -- 1.5.4.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html