[PATCH 3/5] target-i386: kvm: Simplify MSR array construction

2015-12-16 Thread Eduardo Habkost
Add a helper function that appends new entries to the MSR buffer
and checks for the buffer size limit.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 262 ++
 1 file changed, 125 insertions(+), 137 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 1e82400..ada484f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1359,6 +1359,18 @@ static void kvm_msr_buf_reset(X86CPU *cpu)
 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
 }
 
+static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
+{
+struct kvm_msrs *msrs = cpu->kvm_msr_buf;
+void *limit = ((void*)msrs) + MSR_BUF_SIZE;
+struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
+
+assert((void*)(entry + 1) <= limit);
+
+kvm_msr_entry_set(entry, index, value);
+msrs->nmsrs++;
+}
+
 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
@@ -1407,46 +1419,45 @@ static int kvm_put_msr_feature_control(X86CPU *cpu)
 static int kvm_put_msrs(X86CPU *cpu, int level)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
-int n = 0, i;
+int i;
 
 kvm_msr_buf_reset(cpu);
 
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
-kvm_msr_entry_set(&msrs[n++], MSR_PAT, env->pat);
+kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
+kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
+kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
+kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
 if (has_msr_star) {
-kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
+kvm_msr_entry_add(cpu, MSR_STAR, env->star);
 }
 if (has_msr_hsave_pa) {
-kvm_msr_entry_set(&msrs[n++], MSR_VM_HSAVE_PA, env->vm_hsave);
+kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
 }
 if (has_msr_tsc_aux) {
-kvm_msr_entry_set(&msrs[n++], MSR_TSC_AUX, env->tsc_aux);
+kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
 }
 if (has_msr_tsc_adjust) {
-kvm_msr_entry_set(&msrs[n++], MSR_TSC_ADJUST, env->tsc_adjust);
+kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
 }
 if (has_msr_misc_enable) {
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_MISC_ENABLE,
+kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
   env->msr_ia32_misc_enable);
 }
 if (has_msr_smbase) {
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_SMBASE, env->smbase);
+kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
 }
 if (has_msr_bndcfgs) {
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_BNDCFGS, env->msr_bndcfgs);
+kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
 }
 if (has_msr_xss) {
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_XSS, env->xss);
+kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
 }
 #ifdef TARGET_X86_64
 if (lm_capable_kernel) {
-kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
-kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
-kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
-kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
+kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
+kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
+kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
+kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
 }
 #endif
 /*
@@ -1454,106 +1465,89 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
  * for normal writeback. Limit them to reset or full state updates.
  */
 if (level >= KVM_PUT_RESET_STATE) {
-kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
-kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME,
-  env->system_time_msr);
-kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
+kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
+kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
+kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
 if (has_msr_async_pf_en) {
-kvm_msr_entry_set(&msrs[n++], MSR_KVM_ASYNC_PF_EN,
-  env->async_pf_en_msr);
+kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
 }
 if (has_msr_pv_eoi_en) {
-kvm_msr_entry_set(&msrs[n++], MSR_KVM_PV_EOI_EN,
-  env->pv_eoi_en_msr);
+kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
 }
 if (has_msr_kvm_steal_time) {
-   

[PATCH 4/5] target-i386: kvm: Simplify MSR setting functions

2015-12-16 Thread Eduardo Habkost
Simplify kvm_put_tscdeadline_msr() and
kvm_put_msr_feature_control() using kvm_msr_buf and the
kvm_msr_entry_add() helper.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 28 ++--
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ada484f..3550866 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1374,23 +1374,15 @@ static void kvm_msr_entry_add(X86CPU *cpu, uint32_t 
index, uint64_t value)
 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct {
-struct kvm_msrs info;
-struct kvm_msr_entry entries[1];
-} msr_data;
-struct kvm_msr_entry *msrs = msr_data.entries;
 
 if (!has_msr_tsc_deadline) {
 return 0;
 }
 
-kvm_msr_entry_set(&msrs[0], MSR_IA32_TSCDEADLINE, env->tsc_deadline);
-
-msr_data.info = (struct kvm_msrs) {
-.nmsrs = 1,
-};
+kvm_msr_buf_reset(cpu);
+kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
 
-return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
 }
 
 /*
@@ -1401,19 +1393,11 @@ static int kvm_put_tscdeadline_msr(X86CPU *cpu)
  */
 static int kvm_put_msr_feature_control(X86CPU *cpu)
 {
-struct {
-struct kvm_msrs info;
-struct kvm_msr_entry entry;
-} msr_data;
-
-kvm_msr_entry_set(&msr_data.entry, MSR_IA32_FEATURE_CONTROL,
+kvm_msr_buf_reset(cpu);
+kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL,
   cpu->env.msr_ia32_feature_control);
 
-msr_data.info = (struct kvm_msrs) {
-.nmsrs = 1,
-};
-
-return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
 }
 
 static int kvm_put_msrs(X86CPU *cpu, int level)
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] target-i386: kvm: Eliminate kvm_msr_entry_set()

2015-12-16 Thread Eduardo Habkost
Inline the function inside kvm_msr_entry_add().

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 3550866..b328392 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1346,14 +1346,6 @@ static int kvm_put_sregs(X86CPU *cpu)
 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
 }
 
-static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
-  uint32_t index, uint64_t value)
-{
-entry->index = index;
-entry->reserved = 0;
-entry->data = value;
-}
-
 static void kvm_msr_buf_reset(X86CPU *cpu)
 {
 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
@@ -1367,7 +1359,9 @@ static void kvm_msr_entry_add(X86CPU *cpu, uint32_t 
index, uint64_t value)
 
 assert((void*)(entry + 1) <= limit);
 
-kvm_msr_entry_set(entry, index, value);
+entry->index = index;
+entry->reserved = 0;
+entry->data = value;
 msrs->nmsrs++;
 }
 
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] target-i386: kvm: Increase MSR_BUF_SIZE

2015-12-16 Thread Eduardo Habkost
We are dangerously close to the array limits in kvm_put_msrs()
and kvm_get_msrs(): with the default mcg_cap configuration, we
can set up to 148 MSRs in kvm_put_msrs(), and if we allow mcg_cap
to be changed, we can write up to 236 MSRs.

Use 4096 bytes for the buffer, that can hold 255 kvm_msr_entry
structs.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 660b2d9..1e82400 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -52,8 +52,9 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
-#define MSR_BUF_SIZE \
-(sizeof(struct kvm_msrs) + 150 * sizeof(struct kvm_msr_entry))
+/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
+ * 255 kvm_msr_entry structs */
+#define MSR_BUF_SIZE 4096
 
 #ifndef BUS_MCEERR_AR
 #define BUS_MCEERR_AR 4
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/5] target-i386: kvm: Increase MSR entry array limits, check for array overrun

2015-12-16 Thread Eduardo Habkost
We are dangerously close to the array limits in kvm_put_msrs()
and kvm_get_msrs(): with the default mcg_cap configuration, we
can set up to 148 MSRs in kvm_put_msrs(), and if we allow mcg_cap
to be changed, we can write up to 236 MSRs[1].

This series changes the code to allocate a buffer once per VCPU,
increase buffer size to 4096 bytes (that can hold up to 255 MSR
entries), and check array limits before appending new entries.

[1] I have checked the limits by copying and pasting the
kvm_put_msrs() code to a new file, replacing the "if" lines,
copying the macro definitions, and adding a helper macro to
keep track of the kvm_msr_entry_set() calls. The code can be
seen at:
https://gist.github.com/ehabkost/08d4177a33b8648a71ef

Eduardo Habkost (5):
  target-i386: kvm: Allocate kvm_msrs struct once per VCPU
  target-i386: kvm: Increase MSR_BUF_SIZE
  target-i386: kvm: Simplify MSR array construction
  target-i386: kvm: Simplify MSR setting functions
  target-i386: kvm: Eliminate kvm_msr_entry_set()

 target-i386/cpu-qom.h |   4 +
 target-i386/kvm.c | 322 +++---
 2 files changed, 149 insertions(+), 177 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] target-i386: kvm: Allocate kvm_msrs struct once per VCPU

2015-12-16 Thread Eduardo Habkost
Instead of using 2400 bytes in the stack for 150 MSR entries in
kvm_get_msrs() and kvm_put_msrs(), allocate a buffer once for
each VCPU.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/cpu-qom.h |  4 
 target-i386/kvm.c | 37 +++--
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/target-i386/cpu-qom.h b/target-i386/cpu-qom.h
index e3bfe9d..f349b30 100644
--- a/target-i386/cpu-qom.h
+++ b/target-i386/cpu-qom.h
@@ -69,6 +69,8 @@ typedef struct X86CPUClass {
 void (*parent_reset)(CPUState *cpu);
 } X86CPUClass;
 
+struct kvm_msrs;
+
 /**
  * X86CPU:
  * @env: #CPUX86State
@@ -119,6 +121,8 @@ typedef struct X86CPU {
 struct DeviceState *apic_state;
 struct MemoryRegion *cpu_as_root, *cpu_as_mem, *smram;
 Notifier machine_done;
+
+struct kvm_msrs *kvm_msr_buf;
 } X86CPU;
 
 static inline X86CPU *x86_env_get_cpu(CPUX86State *env)
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6dc9846..660b2d9 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -52,6 +52,9 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define MSR_BUF_SIZE \
+(sizeof(struct kvm_msrs) + 150 * sizeof(struct kvm_msr_entry))
+
 #ifndef BUS_MCEERR_AR
 #define BUS_MCEERR_AR 4
 #endif
@@ -841,6 +844,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 if (has_xsave) {
 env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
 }
+cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
 
 if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
 has_msr_mtrr = true;
@@ -1349,6 +1353,11 @@ static void kvm_msr_entry_set(struct kvm_msr_entry 
*entry,
 entry->data = value;
 }
 
+static void kvm_msr_buf_reset(X86CPU *cpu)
+{
+memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
+}
+
 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
@@ -1397,13 +1406,11 @@ static int kvm_put_msr_feature_control(X86CPU *cpu)
 static int kvm_put_msrs(X86CPU *cpu, int level)
 {
 CPUX86State *env = &cpu->env;
-struct {
-struct kvm_msrs info;
-struct kvm_msr_entry entries[150];
-} msr_data;
-struct kvm_msr_entry *msrs = msr_data.entries;
+struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
 int n = 0, i;
 
+kvm_msr_buf_reset(cpu);
+
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
 kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
@@ -1562,11 +1569,9 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
 }
 }
 
-msr_data.info = (struct kvm_msrs) {
-.nmsrs = n,
-};
+cpu->kvm_msr_buf->nmsrs = n;
 
-return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, &msr_data);
+return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
 
 }
 
@@ -1775,13 +1780,11 @@ static int kvm_get_sregs(X86CPU *cpu)
 static int kvm_get_msrs(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct {
-struct kvm_msrs info;
-struct kvm_msr_entry entries[150];
-} msr_data;
-struct kvm_msr_entry *msrs = msr_data.entries;
+struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
 int ret, i, n;
 
+kvm_msr_buf_reset(cpu);
+
 n = 0;
 msrs[n++].index = MSR_IA32_SYSENTER_CS;
 msrs[n++].index = MSR_IA32_SYSENTER_ESP;
@@ -1904,11 +1907,9 @@ static int kvm_get_msrs(X86CPU *cpu)
 }
 }
 
-msr_data.info = (struct kvm_msrs) {
-.nmsrs = n,
-};
+cpu->kvm_msr_buf->nmsrs = n;
 
-ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
+ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
 if (ret < 0) {
 return ret;
 }
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

2015-12-15 Thread Eduardo Habkost
On Mon, Dec 14, 2015 at 07:17:27PM -0500, Raj, Ashok wrote:
> On Mon, Dec 14, 2015 at 11:37:16PM +0100, Borislav Petkov wrote:
> > On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> > > This is mostly harmless.. since the MCG_CAP space is shared and has no
> > > conflict between vendors. Also just the CAP being set has no effect.
> > 
> > Of course it does - we check SER_P in machine_check_poll() and when
> > I emulate an AMD guest and inject errors into it, error handling is
> > obviously wrong, see:
> > 
> > https://lkml.kernel.org/r/20151123150355.ge5...@pd.tnic
> > 
> 
> I can see how this hurts.. since the poller isn't doing cpu model specific 
> stuff..?
> 
> in the LMCE case, even if you advertise MCG_LMCE_P in MCG_CAP, the guest 
> kernel 
> wont call intel_init_lmce() only from mce_intel.c.. so the same problem
> won't happen.
> 
> but the issue Eduardo mentioned seems like the following.
> 
> New QEMU_LMCE + New KVM_LMCE + New_GUEST_LMCE - No problem
> 
> but if you were to migrate the Guest_LMCE to a non-LMCE supported KVM host
> we could run into an issue.. 
> 
> is this the compatibility issue that you were looking to fix Eduardo?

If I understood you correctly, yes. Also, note that currently
kvm_arch_init_vcpu() simply warns about missing capabilities,
instead of preventing the VM from running/migrating (as it
should). We need to change that, and figure out a good way to
report "feature FOO can't be enabled in this host" errors to
management software[1]. The main problem is that we don't even
have a QMP console available anymore if machine initialization is
aborted.

CCing libvir-list so they get in the loop.

[1] This is similar to what we need for CPUID checks, but the new
MCE feature means we need something more generic (that just
reports QOM property names, probably?)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

2015-12-14 Thread Eduardo Habkost
Hi,

Comments below:

On Thu, Dec 10, 2015 at 02:41:21PM -0500, Ashok Raj wrote:
> This patch adds basic enumeration, control msr's required to support
> Local Machine Check Exception Support (LMCE).
> 
> - Added Local Machine Check definitions, changed MCG_CAP
> - Added support for IA32_FEATURE_CONTROL.
> - When delivering MCE to guest, we deliver to just a single CPU
>   when guest OS has opted in to Local delivery.
> 
> Signed-off-by: Ashok Raj 
> Tested-by: Gong Chen 
> ---
> Resending with proper commit message for second patch
> 
>  target-i386/cpu.c |  8 
>  target-i386/cpu.h |  8 ++--
>  target-i386/kvm.c | 38 +++---
>  3 files changed, 45 insertions(+), 9 deletions(-)
> 
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index 11e5e39..167669a 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -2737,6 +2737,13 @@ static void mce_init(X86CPU *cpu)
>  }
>  }
>  
> +static void feature_control_init(X86CPU *cpu)
> +{
> + CPUX86State *cenv = &cpu->env;
> +
> + cenv->msr_ia32_feature_control = ((1<<20) | (1<<0));

FEATURE_CONTROL_LOCKED and FEATURE_CONTROL_LMCE macros would make
the code clearer.

> +}
> +
>  #ifndef CONFIG_USER_ONLY
>  static void x86_cpu_apic_create(X86CPU *cpu, Error **errp)
>  {
> @@ -2858,6 +2865,7 @@ static void x86_cpu_realizefn(DeviceState *dev, Error 
> **errp)
>  #endif
>  
>  mce_init(cpu);
> +feature_control_init(cpu);
>  
>  #ifndef CONFIG_USER_ONLY
>  if (tcg_enabled()) {
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index 84edfd0..a567d7a 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -282,8 +282,9 @@
>  
>  #define MCG_CTL_P   (1ULL<<8)   /* MCG_CAP register available */
>  #define MCG_SER_P   (1ULL<<24) /* MCA recovery/new status bits */
> +#define MCG_LMCE_P   (1ULL<<27) /* Local Machine Check Supported */

Please use spaces instead of tabs. You can detect this and other
coding style issues in this patch with checkpatch.pl.


>  
> -#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P|MCG_LMCE_P)

This makes mcg_cap change when upgrading QEMU.

VMs with MCG_LMCE_P enabled shouldn't be migratable to hosts
running older kernels, or the guest may try to read or write
MSR_IA32_MCG_EXT_CTL after migration and get a #GP. That means:

1) Older machine-types (pc-2.5 and older) should keep the
   old (MCG_CTL_P|MCG_SER_P) default.
2) We can't make pc-2.6 enable LMCE by default, either, because
   QEMU guarantees that just changing the machine-type shouldn't
   introduce new host requirements (see:
   http://article.gmane.org/gmane.comp.emulators.qemu/346651)

It looks like we need a new -cpu option to enable the feature,
then. At least until we raise the minimum kernel version
requirements of QEMU.

(I didn't review the kvm_mce_inject() changes as I am not
familiar with the details of how LMCE is implemented.)


>  #define MCE_BANKS_DEF   10
>  
>  #define MCG_CAP_BANKS_MASK 0xff
> @@ -291,6 +292,7 @@
>  #define MCG_STATUS_RIPV (1ULL<<0)   /* restart ip valid */
>  #define MCG_STATUS_EIPV (1ULL<<1)   /* ip points to correct instruction */
>  #define MCG_STATUS_MCIP (1ULL<<2)   /* machine check in progress */
> +#define MCG_STATUS_LMCE (1ULL<<3)   /* Local MCE signaled */
>  
>  #define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
>  #define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
> @@ -333,6 +335,7 @@
>  #define MSR_MCG_CAP 0x179
>  #define MSR_MCG_STATUS  0x17a
>  #define MSR_MCG_CTL 0x17b
> +#define MSR_MCG_EXT_CTL  0x4d0
>  
>  #define MSR_P6_EVNTSEL0 0x186
>  
> @@ -892,7 +895,6 @@ typedef struct CPUX86State {
>  
>  uint64_t mcg_status;
>  uint64_t msr_ia32_misc_enable;
> -uint64_t msr_ia32_feature_control;
>  
>  uint64_t msr_fixed_ctr_ctrl;
>  uint64_t msr_global_ctrl;
> @@ -977,8 +979,10 @@ typedef struct CPUX86State {
>  int64_t tsc_khz;
>  void *kvm_xsave_buf;
>  
> +uint64_t msr_ia32_feature_control;
>  uint64_t mcg_cap;
>  uint64_t mcg_ctl;
> +uint64_t mcg_ext_ctl;
>  uint64_t mce_banks[MCE_BANKS_DEF*4];
>  
>  uint64_t tsc_aux;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 6dc9846..c61fe1f 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -72,6 +72,7 @@ static bool has_msr_tsc_aux;
>  static bool has_msr_tsc_adjust;
>  static bool has_msr_tsc_deadline;
>  static bool has_msr_feature_control;
> +static bool has_msr_ext_mcg_ctl;
>  static bool has_msr_async_pf_en;
>  static bool has_msr_pv_eoi_en;
>  static bool has_msr_misc_enable;
> @@ -370,18 +371,30 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, 
> int code)
>  uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
>MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
>  uint64_t 

Re: [Qemu-devel] [Patch V2 1/2] x86, mce: Basic support to add LMCE support to QEMU

2015-12-14 Thread Eduardo Habkost
On Mon, Dec 14, 2015 at 02:11:46PM -0500, Raj, Ashok wrote:
> On Mon, Dec 14, 2015 at 05:37:38PM +0100, Borislav Petkov wrote:
> > 
> > ... and obviously LMCE is vendor-specific so it cannot be enabled on
> > !Intel guests with a define like that. mce_init() in qemu should check
> > vendor too.
> > 
> > The same mistake was done with SER_P but that's much harder to change,
> > as we discussed previously.
> > 
> 
> This is mostly harmless.. since the MCG_CAP space is shared and has no
> conflict between vendors. Also just the CAP being set has no effect.
> 
> The Guest OS needs to opt-in and the SIGBUS indicating SRAR are the only 
> ones that are treated special treatment. Also Intel was the only one
> broadcasting MCE's.. so we are like the rest now :-)

If the feature won't be enabled by default, I believe it will be
OK to not make it conditional on CPUID vendor (as we would be
simply doing what the user asked for).

But if it's going to be enabled by default, I would like to get
some assurance that there won't be conflicts between vendors in
the MCG_CAP bits.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/3] target-i386: Use xsave structs for ext_save_area

2015-12-04 Thread Eduardo Habkost
This doesn't introduce any change in the code, as the offsets and
struct sizes match what was present in the table. This can be
validated by the QEMU_BUILD_BUG_ON lines on target-i386/cpu.h,
which ensures the struct sizes and offsets match the existing
values in ext_save_area.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
Changes series v1 -> v2
* (none)

Changes series v2 -> v3:
* Added PKRU state
---
 target-i386/cpu.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index a3de18d..c4cfedb 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -477,19 +477,26 @@ typedef struct ExtSaveArea {
 
 static const ExtSaveArea ext_save_areas[] = {
 [2] = { .feature = FEAT_1_ECX, .bits = CPUID_EXT_AVX,
-.offset = 0x240, .size = 0x100 },
+.offset = offsetof(X86XSaveArea, avx_state),
+.size = sizeof(XSaveAVX) },
 [3] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x3c0, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndreg_state),
+.size = sizeof(XSaveBNDREG)  },
 [4] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x400, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndcsr_state),
+.size = sizeof(XSaveBNDCSR)  },
 [5] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x440, .size = 0x40 },
+.offset = offsetof(X86XSaveArea, opmask_state),
+.size = sizeof(XSaveOpmask) },
 [6] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x480, .size = 0x200 },
+.offset = offsetof(X86XSaveArea, zmm_hi256_state),
+.size = sizeof(XSaveZMM_Hi256) },
 [7] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x680, .size = 0x400 },
+.offset = offsetof(X86XSaveArea, hi16_zmm_state),
+.size = sizeof(XSaveHi16_ZMM) },
 [9] = { .feature = FEAT_7_0_ECX, .bits = CPUID_7_0_ECX_PKU,
-.offset = 0xA80, .size = 0x8 },
+.offset = offsetof(X86XSaveArea, pkru_state),
+.size = sizeof(XSavePKRU) },
 };
 
 const char *get_register_name_32(unsigned int reg)
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-12-04 Thread Eduardo Habkost
target-i386/cpu.c:ext_save_area uses magic numbers for the xsave
area offsets and sizes, and target-i386/kvm.c:kvm_{put,get}_xsave()
uses offset macros and bit manipulation to access the xsave area.
This series changes both to use C structs for those operations.

I still need to figure out a way to write unit tests for the new
code. Maybe I will just copy and paste the new and old functions,
and test them locally (checking if they give the same results
when translating blobs of random bytes).

Changes v1 -> v2:
* Use uint8_t[8*n] instead of uint64_t[n] for register data
* Keep the QEMU_BUILD_BUG_ON lines

Changes v2 -> v3:
* XMM_Q helper is now ZMM_Q
* Added PKRU state

Eduardo Habkost (3):
  target-i386: Define structs for layout of xsave area
  target-i386: Use xsave structs for ext_save_area
  target-i386: kvm: Use X86XSaveArea struct for xsave save/load

 target-i386/cpu.c |  21 
 target-i386/cpu.h |  95 ++
 target-i386/kvm.c | 101 +-
 3 files changed, 170 insertions(+), 47 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 1/3] target-i386: Define structs for layout of xsave area

2015-12-04 Thread Eduardo Habkost
Add structs that define the layout of the xsave areas used by
Intel processors. Add some QEMU_BUILD_BUG_ON lines to ensure the
structs match the XSAVE_* macros in target-i386/kvm.c and the
offsets and sizes at target-i386/cpu.c:ext_save_areas.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
Changes v1 -> v2:
* Use uint8_t[8*n] instead of uint64_t[n] for register data

Changes v2 -> v3:
* Added PKRU state
---
 target-i386/cpu.h | 95 +++
 target-i386/kvm.c | 23 ++
 2 files changed, 118 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 54dfe52..ee58306 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -807,6 +807,101 @@ typedef struct {
 
 #define NB_OPMASK_REGS 8
 
+typedef union X86LegacyXSaveArea {
+struct {
+uint16_t fcw;
+uint16_t fsw;
+uint8_t ftw;
+uint8_t reserved;
+uint16_t fpop;
+uint64_t fpip;
+uint64_t fpdp;
+uint32_t mxcsr;
+uint32_t mxcsr_mask;
+FPReg fpregs[8];
+uint8_t xmm_regs[16][16];
+};
+uint8_t data[512];
+} X86LegacyXSaveArea;
+
+typedef struct X86XSaveHeader {
+uint64_t xstate_bv;
+uint64_t xcomp_bv;
+uint8_t reserved[48];
+} X86XSaveHeader;
+
+/* Ext. save area 2: AVX State */
+typedef struct XSaveAVX {
+uint8_t ymmh[16][16];
+} XSaveAVX;
+
+/* Ext. save area 3: BNDREG */
+typedef struct XSaveBNDREG {
+BNDReg bnd_regs[4];
+} XSaveBNDREG;
+
+/* Ext. save area 4: BNDCSR */
+typedef union XSaveBNDCSR {
+BNDCSReg bndcsr;
+uint8_t data[64];
+} XSaveBNDCSR;
+
+/* Ext. save area 5: Opmask */
+typedef struct XSaveOpmask {
+uint64_t opmask_regs[NB_OPMASK_REGS];
+} XSaveOpmask;
+
+/* Ext. save area 6: ZMM_Hi256 */
+typedef struct XSaveZMM_Hi256 {
+uint8_t zmm_hi256[16][32];
+} XSaveZMM_Hi256;
+
+/* Ext. save area 7: Hi16_ZMM */
+typedef struct XSaveHi16_ZMM {
+uint8_t hi16_zmm[16][64];
+} XSaveHi16_ZMM;
+
+/* Ext. save area 9: PKRU state */
+typedef struct XSavePKRU {
+uint32_t pkru;
+uint32_t padding;
+} XSavePKRU;
+
+typedef struct X86XSaveArea {
+X86LegacyXSaveArea legacy;
+X86XSaveHeader header;
+
+/* Extended save areas: */
+
+/* AVX State: */
+XSaveAVX avx_state;
+uint8_t padding[960-576-sizeof(XSaveAVX)];
+/* MPX State: */
+XSaveBNDREG bndreg_state;
+XSaveBNDCSR bndcsr_state;
+/* AVX-512 State: */
+XSaveOpmask opmask_state;
+XSaveZMM_Hi256 zmm_hi256_state;
+XSaveHi16_ZMM hi16_zmm_state;
+/* PKRU State: */
+XSavePKRU pkru_state;
+} X86XSaveArea;
+
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, avx_state) != 0x240);
+QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndreg_state) != 0x3c0);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndcsr_state) != 0x400);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, opmask_state) != 0x440);
+QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, zmm_hi256_state) != 0x480);
+QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, hi16_zmm_state) != 0x680);
+QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, pkru_state) != 0xA80);
+QEMU_BUILD_BUG_ON(sizeof(XSavePKRU) != 0x8);
+
 typedef enum TPRAccess {
 TPR_ACCESS_READ,
 TPR_ACCESS_WRITE,
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 75eb60b..a702b33 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1259,6 +1259,29 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_Hi16_ZMM 416
 #define XSAVE_PKRU 672
 
+#define XSAVE_BYTE_OFFSET(word_offset) \
+((word_offset)*sizeof(((struct kvm_xsave*)0)->region[0]))
+
+#define ASSERT_OFFSET(word_offset, field) \
+QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
+  offsetof(X86XSaveArea, field))
+
+ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
+ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
+ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
+ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
+ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
+ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
+ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
+ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
+ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
+ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
+ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
+ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
+ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
+ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
+ASSERT_OFFSET(XSAVE_PKRU, pkru_state);
+
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 3/3] target-i386: kvm: Use X86XSaveArea struct for xsave save/load

2015-12-04 Thread Eduardo Habkost
Instead of using offset macros and bit operations in a uint32_t
array, use the X86XSaveArea struct to perform the loading/saving
operations in kvm_put_xsave() and kvm_get_xsave().

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
Changes v1 -> v2:
* Use uint8_t pointers when loading/saving xmm, ymmh, zmmh,
  keeping the same load/save logic from previous code
* Keep the QEMU_BUILD_BUG_ON lines

Changes v2 -> v3:
* Added PKRU state
* Now the xmm_regs helper macros are named ZMM_Q, no XMM_Q
---
 target-i386/kvm.c | 78 +++
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index a702b33..3cd8e35 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1285,9 +1285,8 @@ ASSERT_OFFSET(XSAVE_PKRU, pkru_state);
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_xsave* xsave = env->kvm_xsave_buf;
+X86XSaveArea *xsave = env->kvm_xsave_buf;
 uint16_t cwd, swd, twd;
-uint8_t *xmm, *ymmh, *zmmh;
 int i, r;
 
 if (!has_xsave) {
@@ -1302,25 +1301,26 @@ static int kvm_put_xsave(X86CPU *cpu)
 for (i = 0; i < 8; ++i) {
 twd |= (!env->fptags[i]) << i;
 }
-xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
-xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
-memcpy(&xsave->region[XSAVE_CWD_RIP], &env->fpip, sizeof(env->fpip));
-memcpy(&xsave->region[XSAVE_CWD_RDP], &env->fpdp, sizeof(env->fpdp));
-memcpy(&xsave->region[XSAVE_ST_SPACE], env->fpregs,
+xsave->legacy.fcw = cwd;
+xsave->legacy.fsw = swd;
+xsave->legacy.ftw = twd;
+xsave->legacy.fpop = env->fpop;
+xsave->legacy.fpip = env->fpip;
+xsave->legacy.fpdp = env->fpdp;
+memcpy(&xsave->legacy.fpregs, env->fpregs,
 sizeof env->fpregs);
-xsave->region[XSAVE_MXCSR] = env->mxcsr;
-*(uint64_t *)&xsave->region[XSAVE_XSTATE_BV] = env->xstate_bv;
-memcpy(&xsave->region[XSAVE_BNDREGS], env->bnd_regs,
+xsave->legacy.mxcsr = env->mxcsr;
+xsave->header.xstate_bv = env->xstate_bv;
+memcpy(&xsave->bndreg_state.bnd_regs, env->bnd_regs,
 sizeof env->bnd_regs);
-memcpy(&xsave->region[XSAVE_BNDCSR], &env->bndcs_regs,
-sizeof(env->bndcs_regs));
-memcpy(&xsave->region[XSAVE_OPMASK], env->opmask_regs,
+xsave->bndcsr_state.bndcsr = env->bndcs_regs;
+memcpy(&xsave->opmask_state.opmask_regs, env->opmask_regs,
 sizeof env->opmask_regs);
 
-xmm = (uint8_t *)&xsave->region[XSAVE_XMM_SPACE];
-ymmh = (uint8_t *)&xsave->region[XSAVE_YMMH_SPACE];
-zmmh = (uint8_t *)&xsave->region[XSAVE_ZMM_Hi256];
-for (i = 0; i < CPU_NB_REGS; i++, xmm += 16, ymmh += 16, zmmh += 32) {
+for (i = 0; i < CPU_NB_REGS; i++) {
+uint8_t *xmm = xsave->legacy.xmm_regs[i];
+uint8_t *ymmh = xsave->avx_state.ymmh[i];
+uint8_t *zmmh = xsave->zmm_hi256_state.zmm_hi256[i];
 stq_p(xmm, env->xmm_regs[i].ZMM_Q(0));
 stq_p(xmm+8,   env->xmm_regs[i].ZMM_Q(1));
 stq_p(ymmh,env->xmm_regs[i].ZMM_Q(2));
@@ -1332,9 +1332,9 @@ static int kvm_put_xsave(X86CPU *cpu)
 }
 
 #ifdef TARGET_X86_64
-memcpy(&xsave->region[XSAVE_Hi16_ZMM], &env->xmm_regs[16],
+memcpy(&xsave->hi16_zmm_state.hi16_zmm, &env->xmm_regs[16],
 16 * sizeof env->xmm_regs[16]);
-memcpy(&xsave->region[XSAVE_PKRU], &env->pkru, sizeof env->pkru);
+memcpy(&xsave->pkru_state, &env->pkru, sizeof env->pkru);
 #endif
 r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
 return r;
@@ -1669,9 +1669,8 @@ static int kvm_get_fpu(X86CPU *cpu)
 static int kvm_get_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_xsave* xsave = env->kvm_xsave_buf;
+X86XSaveArea *xsave = env->kvm_xsave_buf;
 int ret, i;
-const uint8_t *xmm, *ymmh, *zmmh;
 uint16_t cwd, swd, twd;
 
 if (!has_xsave) {
@@ -1683,33 +1682,32 @@ static int kvm_get_xsave(X86CPU *cpu)
 return ret;
 }
 
-cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
-swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
-twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
-env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
+cwd = xsave->legacy.fcw;
+swd = xsave->legacy.fsw;
+twd = xsave->legacy.ftw;
+env->fpop = xsave->legacy.fpop;
 env->fpstt = (swd >> 11) & 7;
 env->fpus = swd;
 env->fpuc = cwd;
 for (i = 0; i < 8; ++i) {
 env->fptags[i] = !((twd >> i) & 1);
 }
-memcpy(&env->fpip, &xsave->region[XSAVE_CWD_RIP], sizeof(env->fpip));
-memcpy(&env->fpdp, &xsave->region[XSAVE_CWD_RDP], sizeof(env->fpdp));
-env->mxcsr = xsave->region[XSAVE_MXCSR];
-memcpy(env->fpregs, >region[XSAVE_ST_SPACE

Re: [Qemu-devel] [PATCH v3 0/3] target-i386: add memory protection-key support

2015-12-04 Thread Eduardo Habkost
On Wed, Nov 18, 2015 at 10:20:14AM +0800, Huaitong Han wrote:
> Changes in v3:
> *Fix cpuid_7_0_ecx_feature_name error.
> 
> Changes in v2:
> *Fix memcpy error for xsave state.
> *Fix TCG_7_0_ECX_FEATURES to 0.
> *Make subjects more readable.
> 
> The protection-key feature provides an additional mechanism by which IA-32e
> paging controls access to usermode addresses.
> 
> Hardware support for protection keys for user pages is enumerated with CPUID
> feature flag CPUID.7.0.ECX[3]:PKU. Software support is CPUID.7.0.ECX[4]:OSPKE
> with the setting of CR4.PKE(bit 22).
> 
> The PKRU register is XSAVE-managed state CPUID.D.0.EAX[9], the size of XSAVE
> state component for PKRU is 8 bytes, the offset is 0xa80.
> 
> The specification of Protection Keys can be found at SDM (4.6.2, volume 3)
> http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf.

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

The patches were squashed together and queued in x86-next branch
for 2.6.

> 
> Huaitong Han (3):
>   target-i386: add pkeys support for cpuid handling
>   target-i386: add pkeys support for xsave state handling
>   target-i386: add pkeys support for vm migration
> 
>  target-i386/cpu.c | 23 ++-
>  target-i386/cpu.h |  7 +++
>  target-i386/kvm.c |  3 +++
>  target-i386/machine.c | 23 +++
>  4 files changed, 55 insertions(+), 1 deletion(-)
> 
> -- 
> 2.4.3
> 
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [for-2.6 PATCH 1/3] target-i386: Define structs for layout of xsave area

2015-12-01 Thread Eduardo Habkost
On Tue, Dec 01, 2015 at 09:09:47AM -0800, Richard Henderson wrote:
> On 11/30/2015 03:18 AM, Paolo Bonzini wrote:
> >Because this is always little endian, I would write it as uint8_t[16][16].
> 
> Maybe.  That isn't altogether handy for TCG, since we'll be wanting to bswap
> these buffers (probably in uint64_t chunks).

X86XSaveArea will be used only when loading/saving state using
xsave, not for executing regular instructions. In X86CPU, the
data is already stored as XMMReg unions (the one with the
XMM_[BWDQ] helpers).

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [for-2.6 PATCH 1/3] target-i386: Define structs for layout of xsave area

2015-12-01 Thread Eduardo Habkost
On Tue, Dec 01, 2015 at 06:27:17PM +0100, Paolo Bonzini wrote:
> On 01/12/2015 18:20, Richard Henderson wrote:
> >>
> >> X86XSaveArea will be used only when loading/saving state using
> >> xsave, not for executing regular instructions.
> > 
> > ... like the regular instruction xsave?
> > 
> > https://patchwork.ozlabs.org/patch/493318/
> 
> Right, but that's a helper anyway.
> 
> >> In X86CPU, the
> >> data is already stored as XMMReg unions (the one with the
> >> XMM_[BWDQ] helpers).
> > 
> > Of course.  But those unions are arranged to be in big-endian format on
> > big-endian hosts.  So we need to swap the data back to little-endian
> > format for storage into guest memory.
> 
> Yes, you can use byte moves with XMM_B (more obvious), or stq_le_p with
> XMM_Q (faster I guess---though the compiler might optimize the former on
> little-endian hosts).  Either works with an uint8_t[] destination.

stq_le_p() (more exactly, stq_p()) is exactly what is already
done by kvm_{get,put}_xsave(), using uint8_t pointers.

BTW, if we are going to implement xsave in TCG, the
X86CPU<->xsave translation logic in kvm_{get,put}_xsave() could
be moved to generic code and reused by TCG instead of being
reimplemented.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-12-01 Thread Eduardo Habkost
On Tue, Dec 01, 2015 at 11:22:31AM +0100, Paolo Bonzini wrote:
> On 30/11/2015 18:34, Eduardo Habkost wrote:
> > target-i386/cpu.c:ext_save_area uses magic numbers for the xsave
> > area offsets and sizes, and target-i386/kvm.c:kvm_{put,get}_xsave()
> > uses offset macros and bit manipulation to access the xsave area.
> > This series changes both to use C structs for those operations.
> > 
> > I still need to figure out a way to write unit tests for the new
> > code. Maybe I will just copy and paste the new and old functions,
> > and test them locally (checking if they give the same results
> > when translating blobs of random bytes).
> > 
> > Changes v1 -> v2:
> > * Use uint8_t[8*n] instead of uint64_t[n] for register data
> > * Keep the QEMU_BUILD_BUG_ON lines
> > 
[...]
> > 
> > Eduardo Habkost (3):
> >   target-i386: Define structs for layout of xsave area
> >   target-i386: Use xsave structs for ext_save_area
> >   target-i386: kvm: Use X86XSaveArea struct for xsave save/load
> > 
> >  target-i386/cpu.c | 18 +++
> >  target-i386/cpu.h | 85 
> >  target-i386/kvm.c | 96 
> > +--
> >  3 files changed, 155 insertions(+), 44 deletions(-)
> > 
> 
> The patches are okay, are you going to rebase them on top of the PKRU
> patches?

I will probably redo the PKRU patches on top of this, to reduce
diff size.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-12-01 Thread Eduardo Habkost
On Tue, Dec 01, 2015 at 04:09:44PM +0100, Paolo Bonzini wrote:
> 
> 
> On 30/11/2015 18:34, Eduardo Habkost wrote:
> > target-i386/cpu.c:ext_save_area uses magic numbers for the xsave
> > area offsets and sizes, and target-i386/kvm.c:kvm_{put,get}_xsave()
> > uses offset macros and bit manipulation to access the xsave area.
> > This series changes both to use C structs for those operations.
> > 
> > I still need to figure out a way to write unit tests for the new
> > code. Maybe I will just copy and paste the new and old functions,
> > and test them locally (checking if they give the same results
> > when translating blobs of random bytes).
> 
> I think it's easier to use small guests (i.e. kvm-unit-tests) to test
> this code.

I agree it's easier, but how likely is it to catch bugs in the
save/load code? If the code corrupts a register, we need to
trigger a save/load cycle at the exact moment the guest code is
using that register. Do we have something that helps us
repeatedly save/load CPU state while kvm-unit-tests is running?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-11-30 Thread Eduardo Habkost
fdef TARGET_X86_64
  @@ -1625,17 +1662,17 @@ static int kvm_get_xsave(X86CPU *cpu)
   sizeof env->opmask_regs);

   for (i = 0; i < CPU_NB_REGS; i++) {
  -X86LegacyXSaveArea *legacy = >legacy;
  -XSaveAVX *avx = >avx_state;
  -XSaveZMM_Hi256 *zmm_hi256 = >zmm_hi256_state;
  -env->xmm_regs[i].XMM_Q(0) = ldq_p(>xmm_regs[i][0]);
  -env->xmm_regs[i].XMM_Q(1) = ldq_p(>xmm_regs[i][1]);
  -env->xmm_regs[i].XMM_Q(2) = ldq_p(>ymmh[i][0]);
  -env->xmm_regs[i].XMM_Q(3) = ldq_p(>ymmh[i][1]);
  -env->xmm_regs[i].XMM_Q(4) = ldq_p(_hi256->zmm_hi256[i][0]);
  -env->xmm_regs[i].XMM_Q(5) = ldq_p(_hi256->zmm_hi256[i][1]);
  -env->xmm_regs[i].XMM_Q(6) = ldq_p(_hi256->zmm_hi256[i][2]);
  -env->xmm_regs[i].XMM_Q(7) = ldq_p(_hi256->zmm_hi256[i][3]);
  +uint8_t *xmm = xsave->legacy.xmm_regs[i];
  +uint8_t *ymmh = xsave->avx_state.ymmh[i];
  +uint8_t *zmmh = xsave->zmm_hi256_state.zmm_hi256[i];
  +env->xmm_regs[i].XMM_Q(0) = ldq_p(xmm);
  +env->xmm_regs[i].XMM_Q(1) = ldq_p(xmm+8);
  +env->xmm_regs[i].XMM_Q(2) = ldq_p(ymmh);
  +env->xmm_regs[i].XMM_Q(3) = ldq_p(ymmh+8);
  +env->xmm_regs[i].XMM_Q(4) = ldq_p(zmmh);
  +    env->xmm_regs[i].XMM_Q(5) = ldq_p(zmmh+8);
  +env->xmm_regs[i].XMM_Q(6) = ldq_p(zmmh+16);
  +env->xmm_regs[i].XMM_Q(7) = ldq_p(zmmh+24);
   }

   #ifdef TARGET_X86_64

Eduardo Habkost (3):
  target-i386: Define structs for layout of xsave area
  target-i386: Use xsave structs for ext_save_area
  target-i386: kvm: Use X86XSaveArea struct for xsave save/load

 target-i386/cpu.c | 18 +++
 target-i386/cpu.h | 85 
 target-i386/kvm.c | 96 +--
 3 files changed, 155 insertions(+), 44 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/3] target-i386: kvm: Use X86XSaveArea struct for xsave save/load

2015-11-30 Thread Eduardo Habkost
Instead of using offset macros and bit operations in a uint32_t
array, use the X86XSaveArea struct to perform the loading/saving
operations in kvm_put_xsave() and kvm_get_xsave().

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
Changes v1 -> v2:
* Use uint8_t pointers when loading/saving xmm, ymmh, zmmh,
  keeping the same load/save logic from previous code
* Keep the QEMU_BUILD_BUG_ON lines
---
 target-i386/kvm.c | 74 +++
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index b8b336b..98249e4 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1243,9 +1243,8 @@ ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_xsave* xsave = env->kvm_xsave_buf;
+X86XSaveArea *xsave = env->kvm_xsave_buf;
 uint16_t cwd, swd, twd;
-uint8_t *xmm, *ymmh, *zmmh;
 int i, r;
 
 if (!has_xsave) {
@@ -1260,25 +1259,26 @@ static int kvm_put_xsave(X86CPU *cpu)
 for (i = 0; i < 8; ++i) {
 twd |= (!env->fptags[i]) << i;
 }
-xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
-xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
-memcpy(>region[XSAVE_CWD_RIP], >fpip, sizeof(env->fpip));
-memcpy(>region[XSAVE_CWD_RDP], >fpdp, sizeof(env->fpdp));
-memcpy(>region[XSAVE_ST_SPACE], env->fpregs,
+xsave->legacy.fcw = cwd;
+xsave->legacy.fsw = swd;
+xsave->legacy.ftw = twd;
+xsave->legacy.fpop = env->fpop;
+xsave->legacy.fpip = env->fpip;
+xsave->legacy.fpdp = env->fpdp;
+memcpy(>legacy.fpregs, env->fpregs,
 sizeof env->fpregs);
-xsave->region[XSAVE_MXCSR] = env->mxcsr;
-*(uint64_t *)>region[XSAVE_XSTATE_BV] = env->xstate_bv;
-memcpy(>region[XSAVE_BNDREGS], env->bnd_regs,
+xsave->legacy.mxcsr = env->mxcsr;
+xsave->header.xstate_bv = env->xstate_bv;
+memcpy(>bndreg_state.bnd_regs, env->bnd_regs,
 sizeof env->bnd_regs);
-memcpy(>region[XSAVE_BNDCSR], >bndcs_regs,
-sizeof(env->bndcs_regs));
-memcpy(>region[XSAVE_OPMASK], env->opmask_regs,
+xsave->bndcsr_state.bndcsr = env->bndcs_regs;
+memcpy(>opmask_state.opmask_regs, env->opmask_regs,
 sizeof env->opmask_regs);
 
-xmm = (uint8_t *)>region[XSAVE_XMM_SPACE];
-ymmh = (uint8_t *)>region[XSAVE_YMMH_SPACE];
-zmmh = (uint8_t *)>region[XSAVE_ZMM_Hi256];
-for (i = 0; i < CPU_NB_REGS; i++, xmm += 16, ymmh += 16, zmmh += 32) {
+for (i = 0; i < CPU_NB_REGS; i++) {
+uint8_t *xmm = xsave->legacy.xmm_regs[i];
+uint8_t *ymmh = xsave->avx_state.ymmh[i];
+uint8_t *zmmh = xsave->zmm_hi256_state.zmm_hi256[i];
 stq_p(xmm, env->xmm_regs[i].XMM_Q(0));
 stq_p(xmm+8,   env->xmm_regs[i].XMM_Q(1));
 stq_p(ymmh,env->xmm_regs[i].XMM_Q(2));
@@ -1290,7 +1290,7 @@ static int kvm_put_xsave(X86CPU *cpu)
 }
 
 #ifdef TARGET_X86_64
-memcpy(>region[XSAVE_Hi16_ZMM], >xmm_regs[16],
+memcpy(>hi16_zmm_state.hi16_zmm, >xmm_regs[16],
 16 * sizeof env->xmm_regs[16]);
 #endif
 r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
@@ -1626,9 +1626,8 @@ static int kvm_get_fpu(X86CPU *cpu)
 static int kvm_get_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_xsave* xsave = env->kvm_xsave_buf;
+X86XSaveArea *xsave = env->kvm_xsave_buf;
 int ret, i;
-const uint8_t *xmm, *ymmh, *zmmh;
 uint16_t cwd, swd, twd;
 
 if (!has_xsave) {
@@ -1640,33 +1639,32 @@ static int kvm_get_xsave(X86CPU *cpu)
 return ret;
 }
 
-cwd = (uint16_t)xsave->region[XSAVE_FCW_FSW];
-swd = (uint16_t)(xsave->region[XSAVE_FCW_FSW] >> 16);
-twd = (uint16_t)xsave->region[XSAVE_FTW_FOP];
-env->fpop = (uint16_t)(xsave->region[XSAVE_FTW_FOP] >> 16);
+cwd = xsave->legacy.fcw;
+swd = xsave->legacy.fsw;
+twd = xsave->legacy.ftw;
+env->fpop = xsave->legacy.fpop;
 env->fpstt = (swd >> 11) & 7;
 env->fpus = swd;
 env->fpuc = cwd;
 for (i = 0; i < 8; ++i) {
 env->fptags[i] = !((twd >> i) & 1);
 }
-memcpy(>fpip, >region[XSAVE_CWD_RIP], sizeof(env->fpip));
-memcpy(>fpdp, >region[XSAVE_CWD_RDP], sizeof(env->fpdp));
-env->mxcsr = xsave->region[XSAVE_MXCSR];
-memcpy(env->fpregs, >region[XSAVE_ST_SPACE],
+env->fpip = xsave->legacy.fpip;
+env->fpdp = xsave->legacy.fpdp;
+env->mxcsr = xsave->legacy.mxcsr;
+memcpy(env->fpregs, >legacy.fpregs,
 sizeof env->fpregs);
-env->xstate_bv 

[PATCH v2 1/3] target-i386: Define structs for layout of xsave area

2015-11-30 Thread Eduardo Habkost
Add structs that define the layout of the xsave areas used by
Intel processors. Add some QEMU_BUILD_BUG_ON lines to ensure the
structs match the XSAVE_* macros in target-i386/kvm.c and the
offsets and sizes at target-i386/cpu.c:ext_save_areas.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
Changes v1 -> v2:
* Use uint8_t[8*n] instead of uint64_t[n] for register data
---
 target-i386/cpu.h | 85 +++
 target-i386/kvm.c | 22 ++
 2 files changed, 107 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 84edfd0..41f55ef 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -806,6 +806,91 @@ typedef struct {
 
 #define NB_OPMASK_REGS 8
 
+typedef union X86LegacyXSaveArea {
+struct {
+uint16_t fcw;
+uint16_t fsw;
+uint8_t ftw;
+uint8_t reserved;
+uint16_t fpop;
+uint64_t fpip;
+uint64_t fpdp;
+uint32_t mxcsr;
+uint32_t mxcsr_mask;
+FPReg fpregs[8];
+uint8_t xmm_regs[16][16];
+};
+uint8_t data[512];
+} X86LegacyXSaveArea;
+
+typedef struct X86XSaveHeader {
+uint64_t xstate_bv;
+uint64_t xcomp_bv;
+uint8_t reserved[48];
+} X86XSaveHeader;
+
+/* Ext. save area 2: AVX State */
+typedef struct XSaveAVX {
+uint8_t ymmh[16][16];
+} XSaveAVX;
+
+/* Ext. save area 3: BNDREG */
+typedef struct XSaveBNDREG {
+BNDReg bnd_regs[4];
+} XSaveBNDREG;
+
+/* Ext. save area 4: BNDCSR */
+typedef union XSaveBNDCSR {
+BNDCSReg bndcsr;
+uint8_t data[64];
+} XSaveBNDCSR;
+
+/* Ext. save area 5: Opmask */
+typedef struct XSaveOpmask {
+uint64_t opmask_regs[NB_OPMASK_REGS];
+} XSaveOpmask;
+
+/* Ext. save area 6: ZMM_Hi256 */
+typedef struct XSaveZMM_Hi256 {
+uint8_t zmm_hi256[16][32];
+} XSaveZMM_Hi256;
+
+/* Ext. save area 7: Hi16_ZMM */
+typedef struct XSaveHi16_ZMM {
+uint8_t hi16_zmm[16][64];
+} XSaveHi16_ZMM;
+
+typedef struct X86XSaveArea {
+X86LegacyXSaveArea legacy;
+X86XSaveHeader header;
+
+/* Extended save areas: */
+
+/* AVX State: */
+XSaveAVX avx_state;
+uint8_t padding[960-576-sizeof(XSaveAVX)];
+/* MPX State: */
+XSaveBNDREG bndreg_state;
+XSaveBNDCSR bndcsr_state;
+/* AVX-512 State: */
+XSaveOpmask opmask_state;
+XSaveZMM_Hi256 zmm_hi256_state;
+XSaveHi16_ZMM hi16_zmm_state;
+} X86XSaveArea;
+
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, avx_state) != 0x240);
+QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndreg_state) != 0x3c0);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndcsr_state) != 0x400);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, opmask_state) != 0x440);
+QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, zmm_hi256_state) != 0x480);
+QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, hi16_zmm_state) != 0x680);
+QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400);
+
 typedef enum TPRAccess {
 TPR_ACCESS_READ,
 TPR_ACCESS_WRITE,
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6dc9846..b8b336b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1218,6 +1218,28 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_ZMM_Hi256   288
 #define XSAVE_Hi16_ZMM416
 
+#define XSAVE_BYTE_OFFSET(word_offset) \
+((word_offset)*sizeof(((struct kvm_xsave*)0)->region[0]))
+
+#define ASSERT_OFFSET(word_offset, field) \
+QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
+  offsetof(X86XSaveArea, field))
+
+ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
+ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
+ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
+ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
+ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
+ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
+ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
+ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
+ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
+ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
+ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
+ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
+ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
+ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
+
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/3] target-i386: Use xsave structs for ext_save_area

2015-11-30 Thread Eduardo Habkost
This doesn't introduce any change in the code, as the offsets and
struct sizes match what was present in the table. This can be
validated by the QEMU_BUILD_BUG_ON lines on target-i386/cpu.h,
which ensures the struct sizes and offsets match the existing
values in ext_save_area.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/cpu.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 11e5e39..bc95437 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -458,17 +458,23 @@ typedef struct ExtSaveArea {
 
 static const ExtSaveArea ext_save_areas[] = {
 [2] = { .feature = FEAT_1_ECX, .bits = CPUID_EXT_AVX,
-.offset = 0x240, .size = 0x100 },
+.offset = offsetof(X86XSaveArea, avx_state),
+.size = sizeof(XSaveAVX) },
 [3] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x3c0, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndreg_state),
+.size = sizeof(XSaveBNDREG)  },
 [4] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x400, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndcsr_state),
+.size = sizeof(XSaveBNDCSR)  },
 [5] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x440, .size = 0x40 },
+.offset = offsetof(X86XSaveArea, opmask_state),
+.size = sizeof(XSaveOpmask) },
 [6] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x480, .size = 0x200 },
+.offset = offsetof(X86XSaveArea, zmm_hi256_state),
+.size = sizeof(XSaveZMM_Hi256) },
 [7] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x680, .size = 0x400 },
+.offset = offsetof(X86XSaveArea, hi16_zmm_state),
+.size = sizeof(XSaveHi16_ZMM) },
 };
 
 const char *get_register_name_32(unsigned int reg)
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [for-2.6 PATCH 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-11-30 Thread Eduardo Habkost
On Mon, Nov 30, 2015 at 12:21:23PM +0100, Paolo Bonzini wrote:
> 
> 
> On 28/11/2015 20:56, Eduardo Habkost wrote:
> > I still need to figure out a way to write unit tests for the new
> > code. Maybe I will just copy and paste the new and old functions,
> > and test them locally (checking if they give the same results
> > when translating blobs of random bytes).
> 
> Aren't the QEMU_BUILD_BUG_ON enough?  No need to delete them in patch 3,
> though perhaps you can remove the #defines.

Just wanted to be 100% sure. Even if the offsets are all correct,
I might have made other mistakes when translating the get/save
code.

About the QEMU_BUILD_BUG_ON lines, we can keep them if you like.
We could translate the uint32_t offsets to byte offsets after
patch 3/3, to make them easier to compare to the Intel docs.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [for-2.6 PATCH 1/3] target-i386: Define structs for layout of xsave area

2015-11-30 Thread Eduardo Habkost
On Mon, Nov 30, 2015 at 12:18:33PM +0100, Paolo Bonzini wrote:
> On 28/11/2015 20:56, Eduardo Habkost wrote:
> > +/* Ext. save area 2: AVX State */
> > +typedef struct XSaveAVX {
> > +uint64_t ymmh[16][2];
> > +} XSaveAVX;
> > +
> 
> Because this is always little endian, I would write it as uint8_t[16][16].

The main reason I used uint64_t was to allow the put/save code to
simply use field[0], field[1], field[2] instead of field,
field+8, field+16. But I think your suggestion makes sense: the
fact that we load/save the register data in 8-byte chunks in
kvm_{get,put}_xsave() has nothing to do with the layout of the
data itself (so it doesn't need to be encoded in the struct
definition).

> 
> > +/* Ext. save area 6: ZMM_Hi256 */
> > +typedef struct XSaveZMM_Hi256 {
> > +uint64_t zmm_hi256[16][4];
> > +} XSaveZMM_Hi256;
> 
> Same here (uint8_t[16][32]).
> 
> > +/* Ext. save area 7: Hi16_ZMM */
> > +typedef struct XSaveHi16_ZMM {
> > +XMMReg hi16_zmm[16];
> > +} XSaveHi16_ZMM;
> 
> This is uint8_t[16][64] or uint64_t[16][8].

While writing this series, I had a version that redefined the
XMMReg, YMMReg, ZMMReg structs with the correct sizes, and reused
them for ymmh, zmm_hi256, and hi16_zmm. I still planned to
propose that later. You don't think it would make sense?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v6 3/3] target-i386: add support to migrate vcpu's TSC rate

2015-11-28 Thread Eduardo Habkost
On Fri, Nov 27, 2015 at 08:16:42AM +0800, Haozhong Zhang wrote:
> On 11/26/15 12:19, Eduardo Habkost wrote:
> > On Tue, Nov 24, 2015 at 11:33:57AM +0800, Haozhong Zhang wrote:
> > > This patch enables migrating vcpu's TSC rate. If KVM on the destination
> > > machine supports TSC scaling, guest programs will observe a consistent
> > > TSC rate across the migration.
> > > 
> > > If TSC scaling is not supported on the destination machine, the
> > > migration will not be aborted and QEMU on the destination will not set
> > > vcpu's TSC rate to the migrated value.
> > > 
> > > If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> > > machine is inconsistent with the migrated TSC rate, the migration will
> > > be aborted.
> > > 
> > > For backwards compatibility, the migration of vcpu's TSC rate is
> > > disabled on pc-*-2.4 and older machine types.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>
> > 
> > Assuming the PC compat code will be moved to
> > pc_*_2_5_machine_options(), because the patch will be included
> > after QEMU 2.5.0:
> >
> > Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>
> > 
> > One comment below:
> 
> Hi Eduardo,
> 
> Thank you for reviewing!
> 
> Besides the comment, should I submit a new version which updates the
> compat code after pc-*-2.6 machine types are added?

There's no need to resubmit. I have queued the patches for 2.6,
in the branch at:

  git://github.com/ehabkost/qemu.git x86-next

The pc-2.6 series is in that queue because it is a dependency.
But I plan to rebase and submit a pull request containing only
the x86-specific patches, after the pc-2.6 series is merged
through Michael's tree.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[for-2.6 PATCH 3/3] target-i386: kvm: Use X86XSaveArea struct for xsave save/load

2015-11-28 Thread Eduardo Habkost
Instead of using offset macros and bit operations in a uint32_t
array, use the X86XSaveArea struct to perform the loading/saving
operations in kvm_put_xsave() and kvm_get_xsave().

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 144 --
 1 file changed, 52 insertions(+), 92 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ee6c213..5e7ec70 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1203,50 +1203,11 @@ static int kvm_put_fpu(X86CPU *cpu)
 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
 }
 
-#define XSAVE_FCW_FSW 0
-#define XSAVE_FTW_FOP 1
-#define XSAVE_CWD_RIP 2
-#define XSAVE_CWD_RDP 4
-#define XSAVE_MXCSR   6
-#define XSAVE_ST_SPACE8
-#define XSAVE_XMM_SPACE   40
-#define XSAVE_XSTATE_BV   128
-#define XSAVE_YMMH_SPACE  144
-#define XSAVE_BNDREGS 240
-#define XSAVE_BNDCSR  256
-#define XSAVE_OPMASK  272
-#define XSAVE_ZMM_Hi256   288
-#define XSAVE_Hi16_ZMM416
-
-#define XSAVE_BYTE_OFFSET(word_offset) \
-((word_offset)*sizeof(((struct kvm_xsave*)0)->region[0]))
-
-#define ASSERT_OFFSET(word_offset, field) \
-QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
-  offsetof(X86XSaveArea, field))
-
-ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
-ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
-ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
-ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
-ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
-ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
-ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
-ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
-ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
-ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
-ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
-ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
-ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
-ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
-
-
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-struct kvm_xsave* xsave = env->kvm_xsave_buf;
+X86XSaveArea *xsave = env->kvm_xsave_buf;
 uint16_t cwd, swd, twd;
-uint8_t *xmm, *ymmh, *zmmh;
 int i, r;
 
 if (!has_xsave) {
@@ -1261,37 +1222,38 @@ static int kvm_put_xsave(X86CPU *cpu)
 for (i = 0; i < 8; ++i) {
 twd |= (!env->fptags[i]) << i;
 }
-xsave->region[XSAVE_FCW_FSW] = (uint32_t)(swd << 16) + cwd;
-xsave->region[XSAVE_FTW_FOP] = (uint32_t)(env->fpop << 16) + twd;
-memcpy(>region[XSAVE_CWD_RIP], >fpip, sizeof(env->fpip));
-memcpy(>region[XSAVE_CWD_RDP], >fpdp, sizeof(env->fpdp));
-memcpy(>region[XSAVE_ST_SPACE], env->fpregs,
+xsave->legacy.fcw = cwd;
+xsave->legacy.fsw = swd;
+xsave->legacy.ftw = twd;
+xsave->legacy.fpop = env->fpop;
+xsave->legacy.fpip = env->fpip;
+xsave->legacy.fpdp = env->fpdp;
+memcpy(>legacy.fpregs, env->fpregs,
 sizeof env->fpregs);
-xsave->region[XSAVE_MXCSR] = env->mxcsr;
-*(uint64_t *)>region[XSAVE_XSTATE_BV] = env->xstate_bv;
-memcpy(>region[XSAVE_BNDREGS], env->bnd_regs,
+xsave->legacy.mxcsr = env->mxcsr;
+xsave->header.xstate_bv = env->xstate_bv;
+memcpy(>bndreg_state.bnd_regs, env->bnd_regs,
 sizeof env->bnd_regs);
-memcpy(>region[XSAVE_BNDCSR], >bndcs_regs,
-sizeof(env->bndcs_regs));
-memcpy(>region[XSAVE_OPMASK], env->opmask_regs,
+xsave->bndcsr_state.bndcsr = env->bndcs_regs;
+memcpy(>opmask_state.opmask_regs, env->opmask_regs,
 sizeof env->opmask_regs);
 
-xmm = (uint8_t *)>region[XSAVE_XMM_SPACE];
-ymmh = (uint8_t *)>region[XSAVE_YMMH_SPACE];
-zmmh = (uint8_t *)>region[XSAVE_ZMM_Hi256];
-for (i = 0; i < CPU_NB_REGS; i++, xmm += 16, ymmh += 16, zmmh += 32) {
-stq_p(xmm, env->xmm_regs[i].XMM_Q(0));
-stq_p(xmm+8,   env->xmm_regs[i].XMM_Q(1));
-stq_p(ymmh,env->xmm_regs[i].XMM_Q(2));
-stq_p(ymmh+8,  env->xmm_regs[i].XMM_Q(3));
-stq_p(zmmh,env->xmm_regs[i].XMM_Q(4));
-stq_p(zmmh+8,  env->xmm_regs[i].XMM_Q(5));
-stq_p(zmmh+16, env->xmm_regs[i].XMM_Q(6));
-stq_p(zmmh+24, env->xmm_regs[i].XMM_Q(7));
+for (i = 0; i < CPU_NB_REGS; i++) {
+X86LegacyXSaveArea *legacy = >legacy;
+XSaveAVX *avx = >avx_state;
+XSaveZMM_Hi256 *zmm_hi256 = >zmm_hi256_state;
+stq_p(>xmm_regs[i][0], env->xmm_regs[i].XMM_Q(0));
+stq_p(>xmm_regs[i][1], env->xmm_regs[i].XMM_Q(1));
+stq_p(>ymmh[i][0],env->xmm_regs[i].XMM_Q(2));
+stq_p(>ymmh[i][1],env->xmm_regs[i].XMM_Q(3));
+stq_p(_hi256->zmm_hi256[i][0], env->xmm_regs[i].XMM_Q(4));
+st

[for-2.6 PATCH 1/3] target-i386: Define structs for layout of xsave area

2015-11-28 Thread Eduardo Habkost
Add structs that define the layout of the xsave areas used by
Intel processors. Add some QEMU_BUILD_BUG_ON lines to ensure the
structs match the XSAVE_* macros in target-i386/kvm.c and the
offsets and sizes at target-i386/cpu.c:ext_save_areas.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/cpu.h | 85 +++
 target-i386/kvm.c | 23 +++
 2 files changed, 108 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 84edfd0..3d1d01e 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -806,6 +806,91 @@ typedef struct {
 
 #define NB_OPMASK_REGS 8
 
+typedef union X86LegacyXSaveArea {
+struct {
+uint16_t fcw;
+uint16_t fsw;
+uint8_t ftw;
+uint8_t reserved;
+uint16_t fpop;
+uint64_t fpip;
+uint64_t fpdp;
+uint32_t mxcsr;
+uint32_t mxcsr_mask;
+FPReg fpregs[8];
+uint64_t xmm_regs[16][2];
+};
+uint8_t data[512];
+} X86LegacyXSaveArea;
+
+typedef struct X86XSaveHeader {
+uint64_t xstate_bv;
+uint64_t xcomp_bv;
+uint8_t reserved[48];
+} X86XSaveHeader;
+
+/* Ext. save area 2: AVX State */
+typedef struct XSaveAVX {
+uint64_t ymmh[16][2];
+} XSaveAVX;
+
+/* Ext. save area 3: BNDREG */
+typedef struct XSaveBNDREG {
+BNDReg bnd_regs[4];
+} XSaveBNDREG;
+
+/* Ext. save area 4: BNDCSR */
+typedef union XSaveBNDCSR {
+BNDCSReg bndcsr;
+uint8_t data[64];
+} XSaveBNDCSR;
+
+/* Ext. save area 5: Opmask */
+typedef struct XSaveOpmask {
+uint64_t opmask_regs[NB_OPMASK_REGS];
+} XSaveOpmask;
+
+/* Ext. save area 6: ZMM_Hi256 */
+typedef struct XSaveZMM_Hi256 {
+uint64_t zmm_hi256[16][4];
+} XSaveZMM_Hi256;
+
+/* Ext. save area 7: Hi16_ZMM */
+typedef struct XSaveHi16_ZMM {
+XMMReg hi16_zmm[16];
+} XSaveHi16_ZMM;
+
+typedef struct X86XSaveArea {
+X86LegacyXSaveArea legacy;
+X86XSaveHeader header;
+
+/* Extended save areas: */
+
+/* AVX State: */
+XSaveAVX avx_state;
+uint8_t padding[960-576-sizeof(XSaveAVX)];
+/* MPX State: */
+XSaveBNDREG bndreg_state;
+XSaveBNDCSR bndcsr_state;
+/* AVX-512 State: */
+XSaveOpmask opmask_state;
+XSaveZMM_Hi256 zmm_hi256_state;
+XSaveHi16_ZMM hi16_zmm_state;
+} X86XSaveArea;
+
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, avx_state) != 0x240);
+QEMU_BUILD_BUG_ON(sizeof(XSaveAVX) != 0x100);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndreg_state) != 0x3c0);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDREG) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, bndcsr_state) != 0x400);
+QEMU_BUILD_BUG_ON(sizeof(XSaveBNDCSR) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, opmask_state) != 0x440);
+QEMU_BUILD_BUG_ON(sizeof(XSaveOpmask) != 0x40);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, zmm_hi256_state) != 0x480);
+QEMU_BUILD_BUG_ON(sizeof(XSaveZMM_Hi256) != 0x200);
+QEMU_BUILD_BUG_ON(offsetof(X86XSaveArea, hi16_zmm_state) != 0x680);
+QEMU_BUILD_BUG_ON(sizeof(XSaveHi16_ZMM) != 0x400);
+
 typedef enum TPRAccess {
 TPR_ACCESS_READ,
 TPR_ACCESS_WRITE,
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6dc9846..ee6c213 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1218,6 +1218,29 @@ static int kvm_put_fpu(X86CPU *cpu)
 #define XSAVE_ZMM_Hi256   288
 #define XSAVE_Hi16_ZMM    416
 
+#define XSAVE_BYTE_OFFSET(word_offset) \
+((word_offset)*sizeof(((struct kvm_xsave*)0)->region[0]))
+
+#define ASSERT_OFFSET(word_offset, field) \
+QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
+  offsetof(X86XSaveArea, field))
+
+ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
+ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
+ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
+ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
+ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
+ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
+ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
+ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
+ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
+ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
+ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
+ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
+ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
+ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
+
+
 static int kvm_put_xsave(X86CPU *cpu)
 {
 CPUX86State *env = &cpu->env;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[for-2.6 PATCH 0/3] target-i386: Use C struct for xsave area layout, offsets & sizes

2015-11-28 Thread Eduardo Habkost
target-i386/cpu.c:ext_save_area uses magic numbers for the xsave
area offsets and sizes, and target-i386/kvm.c:kvm_{put,get}_xsave()
uses offset macros and bit manipulation to access the xsave area.
This series changes both to use C structs for the same
operations.

I still need to figure out a way to write unit tests for the new
code. Maybe I will just copy and paste the new and old functions,
and test them locally (checking if they give the same results
when translating blobs of random bytes).

Eduardo Habkost (3):
  target-i386: Define structs for layout of xsave area
  target-i386: Use xsave structs for ext_save_area
  target-i386: kvm: Use X86XSaveArea struct for xsave save/load

 target-i386/cpu.c |  18 +---
 target-i386/cpu.h |  85 ++
 target-i386/kvm.c | 121 +++---
 3 files changed, 149 insertions(+), 75 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[for-2.6 PATCH 2/3] target-i386: Use xsave structs for ext_save_area

2015-11-28 Thread Eduardo Habkost
This doesn't introduce any change in the code, as the offsets and
struct sizes match what was present in the table. This can be
validated by the QEMU_BUILD_BUG_ON lines on target-i386/cpu.h,
which ensures the struct sizes and offsets match the existing
values in ext_save_area.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/cpu.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 11e5e39..bc95437 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -458,17 +458,23 @@ typedef struct ExtSaveArea {
 
 static const ExtSaveArea ext_save_areas[] = {
 [2] = { .feature = FEAT_1_ECX, .bits = CPUID_EXT_AVX,
-.offset = 0x240, .size = 0x100 },
+.offset = offsetof(X86XSaveArea, avx_state),
+.size = sizeof(XSaveAVX) },
 [3] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x3c0, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndreg_state),
+.size = sizeof(XSaveBNDREG)  },
 [4] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_MPX,
-.offset = 0x400, .size = 0x40  },
+.offset = offsetof(X86XSaveArea, bndcsr_state),
+.size = sizeof(XSaveBNDCSR)  },
 [5] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x440, .size = 0x40 },
+.offset = offsetof(X86XSaveArea, opmask_state),
+.size = sizeof(XSaveOpmask) },
 [6] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x480, .size = 0x200 },
+.offset = offsetof(X86XSaveArea, zmm_hi256_state),
+.size = sizeof(XSaveZMM_Hi256) },
 [7] = { .feature = FEAT_7_0_EBX, .bits = CPUID_7_0_EBX_AVX512F,
-.offset = 0x680, .size = 0x400 },
+.offset = offsetof(X86XSaveArea, hi16_zmm_state),
+.size = sizeof(XSaveHi16_ZMM) },
 };
 
 const char *get_register_name_32(unsigned int reg)
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 1/3] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-26 Thread Eduardo Habkost
On Tue, Nov 24, 2015 at 11:33:55AM +0800, Haozhong Zhang wrote:
> If no user-specified TSC rate is present, we will try to set
> env->tsc_khz to the value returned by KVM_GET_TSC_KHZ. This patch does
> not change the current functionality of QEMU and just prepares for later
> patches to enable migrating vcpu's TSC rate.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

> ---
>  target-i386/kvm.c | 14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 2a9953b..a0fe9d4 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -832,6 +832,20 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  }
>  }
>  
> +/* vcpu's TSC frequency is either specified by user, or following
> + * the value used by KVM if the former is not present. In the
> + * latter case, we query it from KVM and record in env->tsc_khz,
> + * so that vcpu's TSC frequency can be migrated later via this field.
> + */
> +if (!env->tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
> +-ENOTSUP;
> +if (r > 0) {
> +env->tsc_khz = r;
> +}
> +}
> +
>  if (has_xsave) {
>  env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>  }
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 2/3] target-i386: reorganize TSC rate setting code

2015-11-26 Thread Eduardo Habkost
On Tue, Nov 24, 2015 at 11:33:56AM +0800, Haozhong Zhang wrote:
> Following changes are made to the TSC rate setting code in
> kvm_arch_init_vcpu():
>  * The code is moved to a new function kvm_arch_set_tsc_khz().
>  * If kvm_arch_set_tsc_khz() fails, i.e. following two conditions are
>both satisfied:
>* KVM does not support the TSC scaling or it fails to set vcpu's
>  TSC rate by KVM_SET_TSC_KHZ,
>* the TSC rate to be set is different than the value currently used
>  by KVM,
>then kvm_arch_init_vcpu() will fail. Previously,
>* the lack of TSC scaling never failed kvm_arch_init_vcpu(),
>* the failure of KVM_SET_TSC_KHZ failed kvm_arch_init_vcpu()
>  unconditionally, even though the TSC rate to be set is identical
>  to the value currently used by KVM.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

> ---
>  target-i386/kvm.c | 40 +---
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index a0fe9d4..1e811ee 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -524,6 +524,36 @@ static bool hyperv_enabled(X86CPU *cpu)
>  cpu->hyperv_runtime);
>  }
>  
> +static int kvm_arch_set_tsc_khz(CPUState *cs)
> +{
> +X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
> +int r;
> +
> +if (!env->tsc_khz) {
> +return 0;
> +}
> +
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
> +-ENOTSUP;
> +if (r < 0) {
> +/* When KVM_SET_TSC_KHZ fails, it's an error only if the current
> + * TSC frequency doesn't match the one we want.
> + */
> +int cur_freq = kvm_check_extension(cs->kvm_state, 
> KVM_CAP_GET_TSC_KHZ) ?
> +   kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
> +   -ENOTSUP;
> +if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
> +error_report("warning: TSC frequency mismatch between "
> + "VM and host, and TSC scaling unavailable");
> +return r;
> +}
> +}
> +
> +return 0;
> +}
> +
>  static Error *invtsc_mig_blocker;
>  
>  #define KVM_MAX_CPUID_ENTRIES  100
> @@ -823,13 +853,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  return r;
>  }
>  
> -r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
> -if (r && env->tsc_khz) {
> -r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
> -if (r < 0) {
> -fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> -return r;
> -}
> +r = kvm_arch_set_tsc_khz(cs);
> +if (r < 0) {
> +return r;
>  }
>  
>  /* vcpu's TSC frequency is either specified by user, or following
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 3/3] target-i386: add support to migrate vcpu's TSC rate

2015-11-26 Thread Eduardo Habkost
On Tue, Nov 24, 2015 at 11:33:57AM +0800, Haozhong Zhang wrote:
> This patch enables migrating vcpu's TSC rate. If KVM on the destination
> machine supports TSC scaling, guest programs will observe a consistent
> TSC rate across the migration.
> 
> If TSC scaling is not supported on the destination machine, the
> migration will not be aborted and QEMU on the destination will not set
> vcpu's TSC rate to the migrated value.
> 
> If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> machine is inconsistent with the migrated TSC rate, the migration will
> be aborted.
> 
> For backwards compatibility, the migration of vcpu's TSC rate is
> disabled on pc-*-2.4 and older machine types.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>

Assuming the PC compat code will be moved to
pc_*_2_5_machine_options(), because the patch will be included
after QEMU 2.5.0:

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

One comment below:

> ---
[...]
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 1e811ee..2a0fd54 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2381,6 +2381,28 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>  }
>  }
>  
> +if (level == KVM_PUT_FULL_STATE) {
> +/* kvm_arch_set_tsc_khz() below can be called in two control flows 
> and
> + * we don't need to handle its errors in both of them.
> + *
> + * One is the control flow that creates a vcpu, where
> + * kvm_arch_set_tsc_khz() has already been called once before by
> + * kvm_arch_init_vcpu(). The latter will abort the control flow if 
> there
> + * are any errors of kvm_arch_set_tsc_khz(). Thus, in this control 
> flow,
> + * kvm_arch_set_tsc_khz() below never fails and we can safely ignore 
> its
> + * return values here.
> + *
> + * Another is the control flow of migration that sets vcpu's TSC
> + * frequency on the destination. The only error that can fail the
> + * migration is the mismatch between the migrated and the 
> user-specified
> + * TSC frequencies, which has been handled by cpu_post_load(). Other
> + * errors, i.e. those from kvm_arch_set_tsc_khz(), never fail the
> + * migration, so we also safely ignore its return values in this 
> control
> + * flow.
> + */

This could be more succinct. Something like:

/* We don't check for kvm_arch_set_tsc_khz() errors here, because
 * TSC frequency mismatch shouldn't abort migration, unless the
 * user explicitly asked for a more strict TSC setting (e.g.
 * using an explicit "tsc-freq" option).
 */

No need to resubmit because of that, though. The comment can be
changed when applying the patch.

> +kvm_arch_set_tsc_khz(cpu);
> +}
> +
>  ret = kvm_getput_regs(x86_cpu, 1);
>  if (ret < 0) {
>  return ret;
> diff --git a/target-i386/machine.c b/target-i386/machine.c
> index a18e16e..e560ca3 100644
> --- a/target-i386/machine.c
> +++ b/target-i386/machine.c
> @@ -6,6 +6,8 @@
>  #include "cpu.h"
>  #include "sysemu/kvm.h"
>  
> +#include "qemu/error-report.h"
> +
>  static const VMStateDescription vmstate_segment = {
>  .name = "segment",
>  .version_id = 1,
> @@ -331,6 +333,13 @@ static int cpu_post_load(void *opaque, int version_id)
>  CPUX86State *env = &cpu->env;
>  int i;
>  
> +if (env->tsc_khz && env->user_tsc_khz &&
> +env->tsc_khz != env->user_tsc_khz) {
> +error_report("Mismatch between user-specified TSC frequency and "
> + "migrated TSC frequency");
> +return -EINVAL;
> +}
> +
>  /*
>   * Real mode guest segments register DPL should be zero.
>   * Older KVM version were setting it wrongly.
> @@ -775,6 +784,26 @@ static const VMStateDescription vmstate_xss = {
>  }
>  };
>  
> +static bool tsc_khz_needed(void *opaque)
> +{
> +X86CPU *cpu = opaque;
> +CPUX86State *env = &cpu->env;
> +MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
> +PCMachineClass *pcmc = PC_MACHINE_CLASS(mc);
> +return env->tsc_khz && pcmc->save_tsc_khz;
> +}
> +
> +static const VMStateDescription vmstate_tsc_khz = {
> +.name = "cpu/tsc_khz",
> +.version_id = 1,
> +.minimum_version_id = 1,
> +.needed = tsc_khz_needed,
> +.fields = (VMStateField[]) {
> +VMSTATE_INT64(env.tsc_khz, X86CPU),
> +VMSTATE_END_OF_LIST()
> +}
> +};
> +
>  VMStateDescription vmstate_x86_cpu = {
>  .name = "cpu",
>  .version_id = 12,
> @@ -895,6 +924,7 @@ VMStateDescription vmstate_x86_cpu = {
>  &vmstate_msr_hyperv_runtime,
>  &vmstate_avx512,
>  &vmstate_xss,
> +&vmstate_tsc_khz,
>  NULL
>  }
>  };
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] target-i386: kvm: Print warning when clearing mcg_cap bits

2015-11-25 Thread Eduardo Habkost
On Wed, Nov 25, 2015 at 05:45:20PM +0100, Paolo Bonzini wrote:
> On 25/11/2015 16:49, Eduardo Habkost wrote:
> > Instead of silently clearing mcg_cap bits when the host doesn't
> > support them, print a warning when doing that.
> > 
> > Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
> > ---
> >  target-i386/kvm.c | 8 +++-
> >  1 file changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index d63a85b..446bdfc 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -774,7 +774,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
> >  && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
> > (CPUID_MCE | CPUID_MCA)
> >  && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
> > -uint64_t mcg_cap;
> > +uint64_t mcg_cap, unsupported_caps;
> >  int banks;
> >  int ret;
> >  
> > @@ -790,6 +790,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
> >  return -ENOTSUP;
> >  }
> >  
> > +unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
> > +if (unsupported_caps) {
> > +error_report("warning: Unsupported MCG_CAP bits: 0x%" PRIx64 
> > "\n",
> 
> \n should not be at end of error_report.
> 
> Fixed and applied.

MCG_CAP_BANKS_MASK is defined by patch 2/3. Have you applied the
whole series?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] target-i386: kvm: Print warning when clearing mcg_cap bits

2015-11-25 Thread Eduardo Habkost
Instead of silently clearing mcg_cap bits when the host doesn't
support them, print a warning when doing that.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index d63a85b..446bdfc 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -774,7 +774,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
(CPUID_MCE | CPUID_MCA)
 && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
-uint64_t mcg_cap;
+uint64_t mcg_cap, unsupported_caps;
 int banks;
 int ret;
 
@@ -790,6 +790,12 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return -ENOTSUP;
 }
 
+unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
+if (unsupported_caps) {
+error_report("warning: Unsupported MCG_CAP bits: 0x%" PRIx64 "\n",
+ unsupported_caps);
+}
+
 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
 if (ret < 0) {
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] target-i386: kvm: Use env->mcg_cap when setting up MCE

2015-11-25 Thread Eduardo Habkost
When setting up MCE, instead of using the MCE_*_DEF macros
directly, just filter the existing env->mcg_cap value.

As env->mcg_cap is already initialized as
MCE_CAP_DEF|MCE_BANKS_DEF at target-i386/cpu.c:mce_init(), this
doesn't change any behavior. But it will allow us to change
mce_init() in the future, to implement different defaults
depending on CPU model, machine-type or command-line parameters.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 11 ---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index fc4a605..84edfd0 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -286,6 +286,8 @@
 #define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF   10
 
+#define MCG_CAP_BANKS_MASK 0xff
+
 #define MCG_STATUS_RIPV (1ULL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV (1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP (1ULL<<2)   /* machine check in progress */
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index ee7bc69..d63a85b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -784,21 +784,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-if (MCE_BANKS_DEF > banks) {
+if ((env->mcg_cap & MCG_CAP_BANKS_MASK) > banks) {
 error_report("kvm: Unsupported MCE bank count: %d > %d\n",
- MCE_BANKS_DEF, banks);
+ (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
 return -ENOTSUP;
 }
 
-mcg_cap &= MCE_CAP_DEF;
-mcg_cap |= MCE_BANKS_DEF;
-ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
+env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
+ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
 if (ret < 0) {
 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
 return ret;
 }
-
-env->mcg_cap = mcg_cap;
 }
 
 qemu_add_vm_change_state_handler(cpu_update_state, env);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] target-i386: kvm: Abort if MCE bank count is not supported by host

2015-11-25 Thread Eduardo Habkost
Instead of silently changing the number of banks in mcg_cap based
on kvm_get_mce_cap_supported(), abort initialization if the host
doesn't support MCE_BANKS_DEF banks.

Note that MCE_BANKS_DEF was always 10 since it was introduced in
QEMU, and Linux always returned 32 at KVM_CAP_MCE since
KVM_CAP_MCE was introduced, so no behavior is being changed and
the error can't be triggered by any Linux version. The point of
the new check is to ensure we won't silently change the bank
count if we change MCE_BANKS_DEF or make the bank count
configurable in the future.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/kvm.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 2a9953b..ee7bc69 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -784,11 +784,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
 return ret;
 }
 
-if (banks > MCE_BANKS_DEF) {
-banks = MCE_BANKS_DEF;
+if (MCE_BANKS_DEF > banks) {
+error_report("kvm: Unsupported MCE bank count: %d > %d\n",
+ MCE_BANKS_DEF, banks);
+return -ENOTSUP;
 }
+
 mcg_cap &= MCE_CAP_DEF;
-mcg_cap |= banks;
+mcg_cap |= MCE_BANKS_DEF;
 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &mcg_cap);
 if (ret < 0) {
 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] target-i386: kvm: Use env->mcg_cap when setting up MCE

2015-11-25 Thread Eduardo Habkost
Instead of overwriting env->mcg_cap, make kvm_arch_init_vcpu(),
use the value already set at the CPU object when initializing
MCE.

Except for the new "unsupported MCG_CAPS bits" warning, this
patch doesn't change any of the existing QEMU behavior. The
previous code set env->mcg_cap to:
  (MCE_CAP_DEF & ioctl(KVM_X86_GET_MCE_CAP_SUPPORTED)) | MCE_BANKS_DEF
and the new code still keeps it exactly the same, as env->mcg_cap
is already initialized as MCE_CAP_DEF|MCE_BANKS_DEF at
mce_init().

This will allow us to change mce_init() in the future, to
implement different defaults depending on CPU model, machine-type
or command-line parameters.

Eduardo Habkost (3):
  target-i386: kvm: Abort if MCE bank count is not supported by host
  target-i386: kvm: Use env->mcg_cap when setting up MCE
  target-i386: kvm: Print warning when clearing mcg_cap bits

 target-i386/cpu.h |  2 ++
 target-i386/kvm.c | 22 ++
 2 files changed, 16 insertions(+), 8 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] target-i386: kvm: Abort if MCE bank count is not supported by host

2015-11-25 Thread Eduardo Habkost
On Wed, Nov 25, 2015 at 05:46:38PM +0100, Paolo Bonzini wrote:
> 
> 
> On 25/11/2015 16:49, Eduardo Habkost wrote:
> > Instead of silently changing the number of banks in mcg_cap based
> > on kvm_get_mce_cap_supported(), abort initialization if the host
> > doesn't support MCE_BANKS_DEF banks.
> > 
> > Note that MCE_BANKS_DEF was always 10 since it was introduced in
> > QEMU, and Linux always returned 32 at KVM_CAP_MCE since
> > KVM_CAP_MCE was introduced, so no behavior is being changed and
> > the error can't be triggered by any Linux version. The point of
> > the new check is to ensure we won't silently change the bank
> > count if we change MCE_BANKS_DEF or make the bank count
> > configurable in the future.
> > 
> > Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
> > ---
> >  target-i386/kvm.c | 9 ++---
> >  1 file changed, 6 insertions(+), 3 deletions(-)
> > 
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index 2a9953b..ee7bc69 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -784,11 +784,14 @@ int kvm_arch_init_vcpu(CPUState *cs)
> >  return ret;
> >  }
> >  
> > -if (banks > MCE_BANKS_DEF) {
> > -banks = MCE_BANKS_DEF;
> > +if (MCE_BANKS_DEF > banks) {
> > +error_report("kvm: Unsupported MCE bank count: %d > %d\n",
> > + MCE_BANKS_DEF, banks);
> 
> Yoda conditions?
> 
> if (banks < MCE_BANKS_DEF) {
> error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = 
> %d)",
>  MCE_BANKS_DEF, banks);

This was on purpose, because MCE_BANKS_DEF is replaced by
(env->mcg_caps & MCG_CAPS_COUNT_MASK) in the next patch.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 1/3] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-25 Thread Eduardo Habkost
On Tue, Nov 24, 2015 at 11:33:55AM +0800, Haozhong Zhang wrote:
> If no user-specified TSC rate is present, we will try to set
> env->tsc_khz to the value returned by KVM_GET_TSC_KHZ. This patch does
> not change the current functionality of QEMU and just prepares for later
> patches to enable migrating vcpu's TSC rate.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

> ---
>  target-i386/kvm.c | 14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 2a9953b..a0fe9d4 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -832,6 +832,20 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  }
>  }
>  
> +/* vcpu's TSC frequency is either specified by user, or following
> + * the value used by KVM if the former is not present. In the
> + * latter case, we query it from KVM and record in env->tsc_khz,
> + * so that vcpu's TSC frequency can be migrated later via this field.
> + */
> +if (!env->tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
> +-ENOTSUP;
> +if (r > 0) {
> +env->tsc_khz = r;
> +}
> +}
> +
>  if (has_xsave) {
>  env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>  }
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: MCG_CAP ABI breakage (was Re: [Qemu-devel] [PATCH] target-i386: Do not set MCG_SER_P by default)

2015-11-24 Thread Eduardo Habkost
On Mon, Nov 23, 2015 at 05:43:14PM +0100, Borislav Petkov wrote:
> On Mon, Nov 23, 2015 at 01:11:27PM -0200, Eduardo Habkost wrote:
> > On Mon, Nov 23, 2015 at 11:22:37AM -0200, Eduardo Habkost wrote:
> > [...]
> > > In the case of this code, it looks like it's already broken
> > > because the resulting mcg_cap depends on host kernel capabilities
> > > (the ones reported by kvm_get_mce_cap_supported()), and the data
> > > initialized by target-i386/cpu.c:mce_init() is silently
> > > overwritten by kvm_arch_init_vcpu(). So we would need to fix that
> > > before implementing a proper compatibility mechanism for
> > > mcg_cap.
> > 
> > Fortunately, when running Linux v2.6.37 and later,
> > kvm_arch_init_vcpu() won't actually change mcg_cap (see details
> > below).
> > 
> > But the code is broken if running on Linux between v2.6.32 and
> > v2.6.36: it will clear MCG_SER_P silently (and silently enable
> > MCG_SER_P when migrating to a newer host).
> > 
> > But I don't know what we should do on those cases. If we abort
> > initialization when the host doesn't support MCG_SER_P, all CPU
> > models with MCE and MCA enabled will become unrunnable on Linux
> > between v2.6.32 and v2.6.36. Should we do that, and simply ask
> > people to upgrade their kernels (or explicitly disable MCE) if
> > they want to run latest QEMU?
> > 
> > For reference, these are the capabilities returned by Linux:
> > * KVM_MAX_MCE_BANKS is 32 since
> >   890ca9aefa78f7831f8f633cab9e4803636dffe4 (v2.6.32-rc1~693^2~199)
> > * KVM_MCE_CAP_SUPPORTED is (MCG_CTL_P | MCG_SER_P) since
> >   5854dbca9b235f8cdd414a0961018763d2d5bf77 (v2.6.37-rc1~142^2~3)
> 
> The commit message of that one says that there is MCG_SER_P support in
> the kernel.
> 
> The previous commit talks about MCE injection with KVM_X86_SET_MCE
> ioctl but frankly, I don't see that. From looking at the current code,
> KVM_X86_SET_MCE does kvm_vcpu_ioctl_x86_setup_mce() which simply sets
> MCG_CAP. And it gets those from KVM_X86_GET_MCE_CAP_SUPPORTED which is

KVM_X86_SET_MCE does not call kvm_vcpu_ioctl_x86_setup_mce(). It
calls kvm_vcpu_ioctl_x86_set_mce(), which stores the
IA32_MCi_{STATUS,ADDR,MISC} register contents at
vcpu->arch.mce_banks.

> 
> #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> 
> So it basically sets those two supported bits. But how is
> 
>   supported == actually present
> 
> ?!?!
> 
> That soo doesn't make any sense.
> 

I didn't check the QEMU MCE code to confirm that, but I assume it
is implemented there. In that case, MCG_SER_P in
KVM_MCE_CAP_SUPPORTED just indicates it can be implemented by
userspace, as long as it makes the appropriate KVM_X86_SET_MCE
(or maybe KVM_SET_MSRS?) calls.


-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: MCG_CAP ABI breakage (was Re: [Qemu-devel] [PATCH] target-i386: Do not set MCG_SER_P by default)

2015-11-23 Thread Eduardo Habkost
On Mon, Nov 23, 2015 at 05:43:14PM +0100, Borislav Petkov wrote:
> On Mon, Nov 23, 2015 at 01:11:27PM -0200, Eduardo Habkost wrote:
> > On Mon, Nov 23, 2015 at 11:22:37AM -0200, Eduardo Habkost wrote:
> > [...]
> > > In the case of this code, it looks like it's already broken
> > > because the resulting mcg_cap depends on host kernel capabilities
> > > (the ones reported by kvm_get_mce_cap_supported()), and the data
> > > initialized by target-i386/cpu.c:mce_init() is silently
> > > overwritten by kvm_arch_init_vcpu(). So we would need to fix that
> > > before implementing a proper compatibility mechanism for
> > > mcg_cap.
> > 
> > Fortunately, when running Linux v2.6.37 and later,
> > kvm_arch_init_vcpu() won't actually change mcg_cap (see details
> > below).
> > 
> > But the code is broken if running on Linux between v2.6.32 and
> > v2.6.36: it will clear MCG_SER_P silently (and silently enable
> > MCG_SER_P when migrating to a newer host).
> > 
> > But I don't know what we should do on those cases. If we abort
> > initialization when the host doesn't support MCG_SER_P, all CPU
> > models with MCE and MCA enabled will become unrunnable on Linux
> > between v2.6.32 and v2.6.36. Should we do that, and simply ask
> > people to upgrade their kernels (or explicitly disable MCE) if
> > they want to run latest QEMU?
> > 
> > For reference, these are the capabilities returned by Linux:
> > * KVM_MAX_MCE_BANKS is 32 since
> >   890ca9aefa78f7831f8f633cab9e4803636dffe4 (v2.6.32-rc1~693^2~199)
> > * KVM_MCE_CAP_SUPPORTED is (MCG_CTL_P | MCG_SER_P) since
> >   5854dbca9b235f8cdd414a0961018763d2d5bf77 (v2.6.37-rc1~142^2~3)
> 
> The commit message of that one says that there is MCG_SER_P support in
> the kernel.
> 
> The previous commit talks about MCE injection with KVM_X86_SET_MCE
> ioctl but frankly, I don't see that. From looking at the current code,
> KVM_X86_SET_MCE does kvm_vcpu_ioctl_x86_setup_mce() which simply sets
> MCG_CAP. And it gets those from KVM_X86_GET_MCE_CAP_SUPPORTED which is
> 
> #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
> 
> So it basically sets those two supported bits. But how is
> 
>   supported == actually present
> 
> ?!?!
> 
> That soo doesn't make any sense.

I will let the people working on the actual MCE emulation in KVM
answer that. I am assuming that KVM_MCE_CAP_SUPPORTED is set to
something that makes sense.

> 
> > * KVM_MCE_CAP_SUPPORTED is MCG_CTL_P between
> >   890ca9aefa78f7831f8f633cab9e4803636dffe4 (v2.6.32-rc1~693^2~199)
> >   and 5854dbca9b235f8cdd414a0961018763d2d5bf77 (v2.6.37-rc1~142^2~3)
> > 
> > The current definitions in QEMU are:
> > #define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
> > #define MCE_BANKS_DEF   10
> 
> That's also wrong. The number of banks is, of course,
> generation-specific.

Note that we don't mimic every single detail of real CPUs out
there, and this isn't necessarily a problem (although sometimes
we choose bad defaults). Do you see real-world scenarios where
choosing 10 as the default causes problems for guest OSes, or do
you just worry that this might cause problems because it doesn't
match any real-world CPU?

The number of banks is whatever QEMU chooses to be the number of
banks. The pc-*-2.4 and older machine-types already expose 10
banks, but we can change it in future machine-types.


> The qemu commit adding that is
> 
> 79c4f6b08009 ("QEMU: MCE: Add MCE simulation to qemu/tcg")
> 
> and I really don't get what the reasoning behind it is? Is this supposed
> to mimick a certain default CPU which is not really resembling a real
> CPU or some generic CPU or what is it?

I don't know the reasoning behind those defaults, but that's the
existing default in pc-*-2.4 and older.

> 
> Because the moment you start qemu with -cpu , all that
> MCA information is totally wrong.

If we really care about matching the number of banks of real
CPUs, we can make it configurable, defined by the CPU model,
and/or have better defaults in future machine-types. That won't
be a problem.

But I still don't know what we should do when somebody runs:
  -machine pc-i440fx-2.4 -cpu Penryn
on a host kernel that doesn't report MCG_SER_P on
KVM_MCE_CAP_SUPPORTED.

> 
> > The target-i386/cpu.c:mce_init() code sets mcg_cap to:
> > env->mcg_cap == MCE_CAP_DEF | MCE_BANKS_DEF;
> >  == (MCG_CTL_P|MCG_SER_P) | 10;
> > 
> > The kvm_arch_init_vcpu() code that changes mcg_cap does the following:
> > kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
> > if (banks > MCE_BANKS_DEF) {
> > banks = MCE_BANKS_DEF;
> >   

Re: [PATCH] target-i386: Do not set MCG_SER_P by default

2015-11-23 Thread Eduardo Habkost
On Sat, Nov 21, 2015 at 02:09:25AM +0100, Borislav Petkov wrote:
> On Sat, Nov 21, 2015 at 12:11:35AM +0100, Andreas Färber wrote:
> > Hi,
> > 
> > CC'ing qemu-devel.
> 
> Ah, thanks.
> 
> > Am 21.11.2015 um 00:01 schrieb Borislav Petkov:
> > > From: Borislav Petkov 
> > > 
> > > Software Error Recovery, i.e. SER, is purely an Intel feature and it
> > > shouldn't be set by default. Enable it only on Intel.

What happens when SER is enabled on an AMD CPU? If it really
shouldn't be enabled, why is KVM returning it on
KVM_X86_GET_MCE_CAP_SUPPORTED?

> > 
> > Is this new in 2.5? Otherwise we would probably need compatibility code
> > in pc*.[ch] for incoming live migration from older versions.
> 
> It looks it is really old, AFAIK from 2010:
> 
> c0532a76b407 ("MCE: Relay UCR MCE to guest")
> 
> You'd need to be more verbose about pc*.[ch]. An example perhaps...?

If you change something that's guest-visible and not part of the
migration stream, you need to keep the old behavior on older
machine-types (e.g. pc-i440fx-2.4), or the CPU will change under
the guest's feet when migrating to another host.

For examples, see the recent commits to include/hw/i386/pc.h.
They are all about keeping compatibility when CPUID bits are
changed:

33b5e8c0 target-i386: Disable rdtscp on Opteron_G* CPU models
6aa91e4a target-i386: Remove POPCNT from qemu64 and qemu32 CPU models
71195672 target-i386: Remove ABM from qemu64 CPU model
0909ad24 target-i386: Remove SSE4a from qemu64 CPU model

In the case of this code, it looks like it's already broken
because the resulting mcg_cap depends on host kernel capabilities
(the ones reported by kvm_get_mce_cap_supported()), and the data
initialized by target-i386/cpu.c:mce_init() is silently
overwritten by kvm_arch_init_vcpu(). So we would need to fix that
before implementing a proper compatibility mechanism for
mcg_cap.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


MCG_CAP ABI breakage (was Re: [Qemu-devel] [PATCH] target-i386: Do not set MCG_SER_P by default)

2015-11-23 Thread Eduardo Habkost
On Mon, Nov 23, 2015 at 11:22:37AM -0200, Eduardo Habkost wrote:
[...]
> In the case of this code, it looks like it's already broken
> because the resulting mcg_cap depends on host kernel capabilities
> (the ones reported by kvm_get_mce_cap_supported()), and the data
> initialized by target-i386/cpu.c:mce_init() is silently
> overwritten by kvm_arch_init_vcpu(). So we would need to fix that
> before implementing a proper compatibility mechanism for
> mcg_cap.

Fortunately, when running Linux v2.6.37 and later,
kvm_arch_init_vcpu() won't actually change mcg_cap (see details
below).

But the code is broken if running on Linux between v2.6.32 and
v2.6.36: it will clear MCG_SER_P silently (and silently enable
MCG_SER_P when migrating to a newer host).

But I don't know what we should do in those cases. If we abort
initialization when the host doesn't support MCG_SER_P, all CPU
models with MCE and MCA enabled will become unrunnable on Linux
between v2.6.32 and v2.6.36. Should we do that, and simply ask
people to upgrade their kernels (or explicitly disable MCE) if
they want to run latest QEMU?

For reference, these are the capabilities returned by Linux:
* KVM_MAX_MCE_BANKS is 32 since
  890ca9aefa78f7831f8f633cab9e4803636dffe4 (v2.6.32-rc1~693^2~199)
* KVM_MCE_CAP_SUPPORTED is (MCG_CTL_P | MCG_SER_P) since
  5854dbca9b235f8cdd414a0961018763d2d5bf77 (v2.6.37-rc1~142^2~3)
* KVM_MCE_CAP_SUPPORTED is MCG_CTL_P between
  890ca9aefa78f7831f8f633cab9e4803636dffe4 (v2.6.32-rc1~693^2~199)
  and 5854dbca9b235f8cdd414a0961018763d2d5bf77 (v2.6.37-rc1~142^2~3)

The current definitions in QEMU are:
#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P)
#define MCE_BANKS_DEF   10

The target-i386/cpu.c:mce_init() code sets mcg_cap to:
env->mcg_cap == MCE_CAP_DEF | MCE_BANKS_DEF;
 == (MCG_CTL_P|MCG_SER_P) | 10;

The kvm_arch_init_vcpu() code that changes mcg_cap does the following:
kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
if (banks > MCE_BANKS_DEF) {
banks = MCE_BANKS_DEF;
}
mcg_cap &= MCE_CAP_DEF;
mcg_cap |= banks;
env->mcg_cap = mcg_cap;
  * Therefore, if running Linux v2.6.37 or newer, this will be
the result:
banks == 10;
mcg_cap == (MCG_CTL_P|MCG_SER_P) | banks
== (MCG_CTL_P|MCG_SER_P) | 10;
* That's the same value set by mce_init(), so fortunately
  kvm_arch_init_vcpu() isn't actually changing mcg_cap
  if running Linux v2.6.37 and newer.
  * However, if running Linux between v2.6.32 and v2.6.36,
kvm_arch_init_vcpu() will silently clear MCG_SER_P.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 1/3] target-i386: add pkeys support for cpuid handling

2015-11-19 Thread Eduardo Habkost
On Thu, Nov 19, 2015 at 12:10:49PM +0100, Paolo Bonzini wrote:
> 
> 
> On 19/11/2015 07:36, Han, Huaitong wrote:
> > I understand it has always been that QEMU considers the feature of
> >  cpuid_7_0_ecx_feature_name as migratable. If the feature is
> >  unmigratable, it will been added to unmigratable_flags.
> > 
> > A series of patches do complete a full function, moving
> >  cpuid_7_0_ecx_feature_name to 2/3 patch may make 2/3 patch look
> > better, but make 1/3 patch look somewhat incomplete.
> > 
> > Maybe it is a solution that adding the feature to unmigratable_flags in
> > 1/3 patch, and deleting unmigratable_flags in 2/3 patch, but I think it
> > is pointless.
> 
> Or just squash everything together.  After all we're talking of
> 
>  4 files changed, 55 insertions(+), 1 deletion(-)
> 
> It's not a large patch.

It makes sense. Adding the state to X86CPU (2/3) is useful only
if we migrate it (3/3), and adding the feature names (1/3) is
useful only if we can handle the new state.

I will squash everything together when applying, in case there's
no new version.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 1/3] target-i386: add pkeys support for cpuid handling

2015-11-18 Thread Eduardo Habkost
On Wed, Nov 18, 2015 at 10:20:15AM +0800, Huaitong Han wrote:
[...]
> @@ -408,6 +420,13 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
> = {
>  .cpuid_reg = R_EBX,
>  .tcg_features = TCG_7_0_EBX_FEATURES,
>  },
> +[FEAT_7_0_ECX] = {
> +.feat_names = cpuid_7_0_ecx_feature_name,
> +.cpuid_eax = 7,
> +.cpuid_needs_ecx = true, .cpuid_ecx = 0,
> +.cpuid_reg = R_ECX,
> +.tcg_features = TCG_7_0_ECX_FEATURES,
> +},

The patch looks good, but when we add the feature names to
cpuid_7_0_ecx_feature_name, QEMU will consider them as
migratable, but they are truly migratable only after we add the
ext_save_areas entry.

We can fix this by moving cpuid_7_0_ecx_feature_name to patch 2/3
(or to a separate patch, to be applied after 2/3).

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 0/3] target-i386: add memory protection-key support

2015-11-18 Thread Eduardo Habkost
On Wed, Nov 18, 2015 at 10:20:14AM +0800, Huaitong Han wrote:
> Changes in v3:
> *Fix cpuid_7_0_ecx_feature_name error.
> 
> Changes in v2:
> *Fix memcpy error for xsave state.
> *Fix TCG_7_0_ECX_FEATURES to 0.
> *Make subjects more readable.
> 
> The protection-key feature provides an additional mechanism by which IA-32e
> paging controls access to usermode addresses.
> 
> Hardware support for protection keys for user pages is enumerated with CPUID
> feature flag CPUID.7.0.ECX[3]:PKU. Software support is CPUID.7.0.ECX[4]:OSPKE
> with the setting of CR4.PKE(bit 22).
> 
> The PKRU register is XSAVE-managed state CPUID.D.0.EAX[9], the size of XSAVE
> state component for PKRU is 8 bytes, the offset is 0xa80.

Is every CPU supporting PKU guaranteed to have
CPUID.(EAX=0DH,ECX=9):EBX = 0xa80? Where is the PKRU state
offset/layout documented?

> 
> The specification of Protection Keys can be found at SDM (4.6.2, volume 3)
> http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf.
> 
[...]

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8 0/5] implement vNVDIMM

2015-11-18 Thread Eduardo Habkost
On Wed, Nov 18, 2015 at 09:59:34AM +0800, Xiao Guangrong wrote:
> 
> Ping...
> 
> Do you have any comment on this patchset? Could it be applied to somewhere
> if it is okay for you?

I have no additional comments, as the memory-backend patches I
was reviewing are not included in this version. I didn't take the
time to review the TYPE_NVDIMM and ACPI changes.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 2/3] target-i386: reorganize TSC rate setting code

2015-11-17 Thread Eduardo Habkost
On Tue, Nov 17, 2015 at 01:20:38PM +0800, Haozhong Zhang wrote:
> Following two changes are made to the TSC rate setting code in
> kvm_arch_init_vcpu():
>  * The code is moved to a new function kvm_arch_set_tsc_khz().
>  * If setting user-specified TSC rate fails and the host TSC rate is
>inconsistent with the user-specified one, print a warning message.
> 
> Signed-off-by: Haozhong Zhang 

This matches what I was expecting, and now I see that we don't
even need the user_tsc_khz field.

> ---
>  target-i386/kvm.c | 45 ++---
>  1 file changed, 38 insertions(+), 7 deletions(-)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 9e4d27f..6a1acb4 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -524,6 +524,41 @@ static bool hyperv_enabled(X86CPU *cpu)
>  cpu->hyperv_runtime);
>  }
>  
> +/**
> + * Ask KVM to set vcpu's TSC rate to X86_CPU(cs)->env.tsc_khz.
> + *
> + * Returns: 0if successful;
> + *  -ENOTSUP if KVM_CAP_TSC_CONTROL is unavailable;
> + *  -EIO if KVM_SET_TSC_KHZ fails.

If KVM_SET_TSC_KHZ fails, the error code will be useful to
understand what went wrong. It's better to return the error code
returned by KVM instead of -EIO.

> + */
> +static int kvm_arch_set_tsc_khz(CPUState *cs)
> +{
> +X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
> +int has_tsc_control, r = 0;
> +
> +if (!env->tsc_khz) {
> +return 0;
> +}
> +
> +has_tsc_control = kvm_check_extension(cs->kvm_state, 
> KVM_CAP_TSC_CONTROL);
> +if (has_tsc_control) {
> +r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
> +}
> +
> +if (!has_tsc_control || r < 0) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
> +if (r <= 0 || r != env->tsc_khz) {
> +error_report("warning: TSC frequency mismatch between "
> + "VM and host, and TSC scaling unavailable");
> +return has_tsc_control ? -EIO : -ENOTSUP;
> +}
> +}

What about:

r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
-ENOTSUP;
if (r < 0) {
/* If KVM_SET_TSC_KHZ fails, it is an error only if the
 * current TSC frequency doesn't match the one we want.
 */
int cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
   kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
   -ENOTSUP;
   if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
   error_report("warning: TSC frequency mismatch between "
"VM and host, and TSC scaling unavailable");
   return r;
   }
}

return 0;

> +
> +return 0;
> +}
> +
>  static Error *invtsc_mig_blocker;
>  
>  #define KVM_MAX_CPUID_ENTRIES  100
> @@ -823,13 +858,9 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  return r;
>  }
>  
> -r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
> -if (r && env->tsc_khz) {
> -r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
> -if (r < 0) {
> -fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> -return r;
> -}
> +if (kvm_arch_set_tsc_khz(cs) == -EIO) {
> +fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");

Now that kvm_arch_set_tsc_khz() prints an error message, we can
remove this one.

> +return -EIO;

To keep the previous behavior without losing the error code
returned by KVM, this could be written as:

r = kvm_arch_set_tsc_khz(cs);
if (r < 0 && r != -ENOTSUP) {
return r;
}

But I believe the previous behavior was wrong. If tsc-freq was
explicitly set by the user, we should abort if
KVM_CAP_TSC_CONTROL is unavailable.

So I suggest we simply do this:

r = kvm_arch_set_tsc_khz(cs);
if (r < 0) {
return r;
}

(In case you implement this behavior change in the same patch,
please mention that in the commit message.)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/3] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-17 Thread Eduardo Habkost
On Tue, Nov 17, 2015 at 01:20:37PM +0800, Haozhong Zhang wrote:
> If no user-specified TSC rate is present, we will try to set
> env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  target-i386/kvm.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 2a9953b..9e4d27f 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -832,6 +832,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  }
>  }
>  
> +/*
> + * If no user-specified TSC frequency is present, we will try to
> + * set env->tsc_khz to the value used by KVM.
> + */

If you send a new version of this series, please describe
"why", not "what". We can see in the code that we are setting
env->tsc_khz to the value used by KVM, but the comment needs to
explain why.

> +if (!env->tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
> +if (r > 0) {
> +env->tsc_khz = r;
> +}
> +}
> +
>  if (has_xsave) {
>  env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>  }
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 3/3] target-i386: add support to migrate vcpu's TSC rate

2015-11-17 Thread Eduardo Habkost
Hi,

On Tue, Nov 17, 2015 at 01:20:39PM +0800, Haozhong Zhang wrote:
> This patch enables migrating vcpu's TSC rate. If KVM on the destination
> machine supports TSC scaling, guest programs will observe a consistent
> TSC rate across the migration.
> 
> If TSC scaling is not supported on the destination machine, the
> migration will not be aborted and QEMU on the destination will not set
> vcpu's TSC rate to the migrated value.
> 
> If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> machine is inconsistent with the migrated TSC rate, the migration will
> be aborted.
> 
> For backwards compatibility, the migration of vcpu's TSC rate is
> disabled on pc-*-2.4 and older machine types.
> 
> Signed-off-by: Haozhong Zhang 

Now the logic in this patch (and the rest of the series) looks
good to me. All my suggestions are only related to code comments
and error handling:

[...]
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 6a1acb4..6856899 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2384,6 +2384,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>  }
>  }
>  
> +if (level == KVM_PUT_FULL_STATE) {
> +kvm_arch_set_tsc_khz(cpu);

Please add a comment here indicating that errors are being
ignored, and explaining why.

> +}
> +
>  ret = kvm_getput_regs(x86_cpu, 1);
>  if (ret < 0) {
>  return ret;
> diff --git a/target-i386/machine.c b/target-i386/machine.c
> index a18e16e..3c5d24b 100644
> --- a/target-i386/machine.c
> +++ b/target-i386/machine.c
> @@ -331,6 +331,13 @@ static int cpu_post_load(void *opaque, int version_id)
>  CPUX86State *env = &cpu->env;
>  int i;
>  
> +if (env->tsc_khz && env->user_tsc_khz &&
> +env->tsc_khz != env->user_tsc_khz) {
> +fprintf(stderr, "Mismatch between user-specified TSC frequency and "
> +"migrated TSC frequency\n");

Please use error_report() instead of fprintf().

> +return -1;

Please return a valid -errno value. Other post_load functions
that implement sanity checks use -EINVAL (e.g.
global_state_post_load(), configuration_post_load()), so that's
probably what we should do here.

> +}
> +
>  /*
>   * Real mode guest segments register DPL should be zero.
>   * Older KVM version were setting it wrongly.
> @@ -775,6 +782,26 @@ static const VMStateDescription vmstate_xss = {
>  }
>  };
>  
> +static bool tsc_khz_needed(void *opaque)
> +{
> +X86CPU *cpu = opaque;
> +CPUX86State *env = &cpu->env;
> +MachineClass *mc = MACHINE_GET_CLASS(qdev_get_machine());
> +PCMachineClass *pcmc = PC_MACHINE_CLASS(mc);
> +return env->tsc_khz && pcmc->save_tsc_khz;
> +}
> +
> +static const VMStateDescription vmstate_tsc_khz = {
> +.name = "cpu/tsc_khz",
> +.version_id = 1,
> +.minimum_version_id = 1,
> +.needed = tsc_khz_needed,
> +.fields = (VMStateField[]) {
> +VMSTATE_INT64(env.tsc_khz, X86CPU),
> +VMSTATE_END_OF_LIST()
> +}
> +};
> +
>  VMStateDescription vmstate_x86_cpu = {
>  .name = "cpu",
>  .version_id = 12,
> @@ -895,6 +922,7 @@ VMStateDescription vmstate_x86_cpu = {
>  &vmstate_msr_hyperv_runtime,
>  &vmstate_avx512,
>  &vmstate_xss,
> +&vmstate_tsc_khz,
>  NULL
>  }
>  };
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 2/3] target-i386: reorganize TSC rate setting code

2015-11-17 Thread Eduardo Habkost
On Tue, Nov 17, 2015 at 10:07:53PM +0800, Haozhong Zhang wrote:
> On 11/17/15 11:32, Eduardo Habkost wrote:
> > On Tue, Nov 17, 2015 at 01:20:38PM +0800, Haozhong Zhang wrote:
> > > Following two changes are made to the TSC rate setting code in
> > > kvm_arch_init_vcpu():
> > >  * The code is moved to a new function kvm_arch_set_tsc_khz().
> > >  * If setting user-specified TSC rate fails and the host TSC rate is
> > >inconsistent with the user-specified one, print a warning message.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>
> > 
> > This matches what I was expecting, and now I see that we don't
> > even need the user_tsc_khz field.
> >
> 
> I guess you mean the user_tsc_khz field is not needed when setting TSC
> rate. It's still needed in patch 3 to check if the migrated TSC rate
> is consistent with the user-specified TSC rate (and of course it's not
> in kvm_arch_set_tsc_khz()).

Yes, I was looking only at the error-checking logic in
kvm_arch_init_vcpu(). Then I noticed user_tsc_khz was added for
the migration sanity check (which makes sense).

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 10:30:08PM +0800, Haozhong Zhang wrote:
> On 11/16/15 11:43, Eduardo Habkost wrote:
> > On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> > > This patch enables migrating vcpu's TSC rate. If KVM on the destination
> > > machine supports TSC scaling, guest programs will observe a consistent
> > > TSC rate across the migration.
> > > 
> > > If TSC scaling is not supported on the destination machine, the
> > > migration will not be aborted and QEMU on the destination will not set
> > > vcpu's TSC rate to the migrated value.
> > > 
> > > If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> > > machine is inconsistent with the migrated TSC rate, the migration will
> > > be aborted.
> > > 
> > > For backwards compatibility, the migration of vcpu's TSC rate is
> > > disabled on pc-*-2.4 and older machine types.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>
[...]
> > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > index 9084b29..8448248 100644
> > > --- a/target-i386/kvm.c
> > > +++ b/target-i386/kvm.c
> > > @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
> > >  int r;
> > >  
> > >  /*
> > > - * If no user-specified TSC frequency is present, we will try to
> > > - * set env->tsc_khz to the value used by KVM.
> > > + * For other cases of env->tsc_khz and env->user_tsc_khz:
> > > + *
> > > + * - We have eliminated all cases that satisfy
> > > + *   env->tsc_khz && env->user_tsc_khz &&
> > > + *   env->tsc_khz != env->user_tsc_khz
> > > + *   in cpu_post_load().
> > > + *
> > > + * - If env->user_tsc_khz is not zero, then it must be equal to
> > > + *   env->tsc_khz (if the latter is not zero) and has been set in
> > > + *   kvm_arch_init_vcpu().
> > > + */
> > > +if (env->tsc_khz && !env->user_tsc_khz) {
> > > +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> > > +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP;
> > 
> > Please don't duplicate the code from kvm_arch_init_vcpu(). We can
> > handle both cases in kvm_arch_put_registers(KVM_PUT_FULL_STATE),
> > can't we?
> >
> 
> No. If KVM_SET_TSC_KHZ fails in kvm_arch_init_vcpu(), QEMU will
> exit. But if KVM_SET_TSC_KHZ fails in the migration, QEMU will not
> abort. And because the return value of
> kvm_arch_put_registers(KVM_PUT_FULL_STATE) is not checked by its
> caller do_kvm_cpu_synchronize_post_init(), I have to handle them in
> different ways.

Reporting errors back in kvm_put_registers() may be difficult, I
see, so handling user-provided TSC frequency in
kvm_arch_init_vcpu() makes sense. But you can still avoid code
duplication. Just reuse the same function in kvm_arch_init_vcpu()
and kvm_put_registers(), but return errors back to the caller in
kvm_arch_init_vcpu() in case env->user_tsc_khz is set.

kvm_put_registers() can ignore the error, and just print a
warning. But (in both cases) we should print a warning only if
env->tsc_khz doesn't match KVM_GET_TSC_KHZ, because we don't want
to print spurious warnings on every migration when TSC scaling
isn't supported. (You even suggested changes to the example code
that does that, at Message-ID:
<20151106023244.gb24...@hzzhang-optiplex-9020.sh.intel.com>).

Also, I believe it won't be a problem if we call KVM_SET_TSC_KHZ
twice in the case of incoming migration, so there's no need to
check user_tsc_khz in the kvm_arch_put_registers() path. Keeping
the code simple is more important than avoiding one extra ioctl()
on incoming migration, IMO.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 2/2] target-i386: add support to migrate vcpu's TSC rate

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 04:04:08PM +0800, Haozhong Zhang wrote:
> This patch enables migrating vcpu's TSC rate. If KVM on the destination
> machine supports TSC scaling, guest programs will observe a consistent
> TSC rate across the migration.
> 
> If TSC scaling is not supported on the destination machine, the
> migration will not be aborted and QEMU on the destination will not set
> vcpu's TSC rate to the migrated value.
> 
> If vcpu's TSC rate specified by CPU option 'tsc-freq' on the destination
> machine is inconsistent with the migrated TSC rate, the migration will
> be aborted.
> 
> For backwards compatibility, the migration of vcpu's TSC rate is
> disabled on pc-*-2.4 and older machine types.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  hw/i386/pc.c  |  1 +
>  hw/i386/pc_piix.c |  1 +
>  hw/i386/pc_q35.c  |  1 +
>  include/hw/i386/pc.h  |  1 +
>  target-i386/cpu.c |  2 +-
>  target-i386/cpu.h |  1 +
>  target-i386/kvm.c | 26 --
>  target-i386/machine.c | 28 
>  8 files changed, 58 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0cb8afd..2f2fc93 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
> *data)
>  HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
>  
>  pcmc->get_hotplug_handler = mc->get_hotplug_handler;
> +pcmc->save_tsc_khz = true;
>  mc->get_hotplug_handler = pc_get_hotpug_handler;
>  mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
>  mc->default_boot_order = "cad";
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 07d0baa..7c5b0d2 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -489,6 +489,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
>  m->alias = NULL;
>  m->is_default = 0;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;
>  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
>  }
>  
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 0fdae09..fd8efe3 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -387,6 +387,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
>  m->hw_version = "2.4.0";
>  m->alias = NULL;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;
>  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
>  }
>  
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 4bbc0ff..fea0f28 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -60,6 +60,7 @@ struct PCMachineClass {
>  
>  /*< public >*/
>  bool broken_reserved_end;
> +bool save_tsc_khz;
>  HotplugHandler *(*get_hotplug_handler)(MachineState *machine,
> DeviceState *dev);
>  };
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index e5f1c5b..98c6a4c 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -1724,7 +1724,7 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor 
> *v, void *opaque,
>  return;
>  }
>  
> -cpu->env.tsc_khz = value / 1000;
> +cpu->env.tsc_khz = cpu->env.user_tsc_khz = value / 1000;
>  }
>  
>  static void x86_cpuid_get_apic_id(Object *obj, Visitor *v, void *opaque,
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index fc4a605..1ad1da8 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -973,6 +973,7 @@ typedef struct CPUX86State {
>  uint32_t sipi_vector;
>  bool tsc_valid;
>  int64_t tsc_khz;
> +int64_t user_tsc_khz;
>  void *kvm_xsave_buf;
>  
>  uint64_t mcg_cap;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 9084b29..8448248 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2334,8 +2334,30 @@ static void kvm_arch_set_tsc_khz(CPUState *cs)
>  int r;
>  
>  /*
> - * If no user-specified TSC frequency is present, we will try to
> - * set env->tsc_khz to the value used by KVM.
> + * For other cases of env->tsc_khz and env->user_tsc_khz:
> + *
> + * - We have eliminated all cases that satisfy
> + *   env->tsc_khz && env->user_tsc_khz &&
> + *   env->tsc_khz != env->user_tsc_khz
> + *   in cpu_post_load().
> + *
> + * - If env->user_tsc_khz is not zero, then it must be equal to
> + *   env->tsc_khz (if the latter is not zero) and has been set in
> + *   kvm_arch_init_vcpu().
> + */
> +if (env->tsc_khz && !env->user_tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> +kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP;

Please don't duplicate the code from kvm_arch_init_vcpu(). We can
handle both cases in kvm_arch_put_registers(KVM_PUT_FULL_STATE),
can't we?

> +if (r < 0) {
> +error_report("warning: TSC frequency mismatch between VM and 
> host, "
> + "and 

Re: [PATCH v4 1/2] target-i386: fallback vcpu's TSC rate to value returned by KVM

2015-11-16 Thread Eduardo Habkost
On Mon, Nov 16, 2015 at 04:04:07PM +0800, Haozhong Zhang wrote:
> If no user-specified TSC rate is present, we will try to set
> env->tsc_khz to the value returned by KVM_GET_TSC_KHZ.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  target-i386/kvm.c | 25 +
>  1 file changed, 25 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 2a9953b..9084b29 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2327,6 +2327,27 @@ static int kvm_get_debugregs(X86CPU *cpu)
>  return 0;
>  }
>  
> +static void kvm_arch_set_tsc_khz(CPUState *cs)

> +{
> +X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
> +int r;
> +
> +/*
> + * If no user-specified TSC frequency is present, we will try to
> + * set env->tsc_khz to the value used by KVM.
> + */
> +if (!env->tsc_khz) {
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
> +kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;

Can't we do this on kvm_arch_init_vcpu()? kvm_arch_put_registers()'s purpose is
to just copy data from QEMU to KVM, not the other way around.


> +if (r < 0) {
> +error_report("warning: KVM_GET_TSC_KHZ failed");

Having a kernel that doesn't support KVM_CAP_GET_TSC_KHZ shouldn't trigger a
warning every time we run QEMU, unless the user is explicitly asking for a
feature that requires KVM_GET_TSC_KHZ.

> +} else {
> +env->tsc_khz = r;
> +}
> +}
> +}
> +
>  int kvm_arch_put_registers(CPUState *cpu, int level)
>  {
>  X86CPU *x86_cpu = X86_CPU(cpu);
> @@ -2341,6 +2362,10 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>  }
>  }
>  
> +if (level == KVM_PUT_FULL_STATE) {
> +kvm_arch_set_tsc_khz(cpu);
> +}

I see that kvm_arch_set_tsc_khz() will be extended to call
KVM_SET_TSC_KHZ in the next patch, so the kvm_arch_set_tsc_khz()
call seems to belong here. But the KVM_GET_TSC_KHZ call doesn't
seem to belong in kvm_arch_set_tsc_khz().

kvm_arch_put_registers() callers don't expect any QEMU-side data
to change, but just that KVM data is updated according to the
QEMU-side data.

> +
>  ret = kvm_getput_regs(x86_cpu, 1);
>  if (ret < 0) {
>  return ret;
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 1/3] target-i386: add a subsection for migrating vcpu's TSC rate

2015-11-13 Thread Eduardo Habkost
On Fri, Nov 13, 2015 at 10:23:54AM +0800, Haozhong Zhang wrote:
> On 11/11/15 22:27, Haozhong Zhang wrote:
> > On 11/11/15 12:16, Eduardo Habkost wrote:
> [...]
> > > > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > > > index 2f8f396..858ed69 100644
> > > > --- a/hw/i386/pc_q35.c
> > > > +++ b/hw/i386/pc_q35.c
> > > > @@ -385,6 +385,7 @@ static void pc_q35_2_4_machine_options(MachineClass 
> > > > *m)
> > > >  pc_q35_2_5_machine_options(m);
> > > >  m->alias = NULL;
> > > >  pcmc->broken_reserved_end = true;
> > > > +pcmc->save_tsc_khz = false;
> > > 
> > > I had suggested the PCMachineClass field, but now I've been thinking:
> > > all other fields related to tsc_khz are in X86CPU, so I believe this
> > > belongs to X86CPU too. It could be a simple X86CPU property set by
> > > PC_COMPAT_2_4.
> > >
> > 
> > Reasonable, will update in the next version.
> 
> Or maybe no ...
> 
> I think there is still a problem to set a X86CPU property in
> PC_COMPAT_2_4:
> 
> if I create a property for save_tsc_khz by adding
>   DEFINE_PROP_BOOL("save-tsc-freq", X86CPU, save_tsc_khz, true)
> in x86_cpu_properties and add
>   {
>   .driver   = TYPE_X86_CPU,
>   .property = "save-tsc-freq",
>   .value= "off",
>   }
> in PC_COMPAT_2_4, then "save-tsc-freq" will also become a
> user-visible cpu option. But we agreed on keeping it as an
> internal flag in the previous discussion.
> 
> Any other ways to set a property in PC_COMPAT_* while keeping that
> property internal?

I don't think making it internal is a requirement. It just makes
things simpler because it allows us to postpone decisions about
the user-visible parts.

...which seems to be a good reason to keep it on PCMachineClass
for now, if you prefer it that way. The subsection code is already
in machine.c and not in cpu.c, anyway.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-11 Thread Eduardo Habkost
On Wed, Nov 11, 2015 at 12:57:44AM +0800, Haozhong Zhang wrote:
> On 11/09/15 14:01, Eduardo Habkost wrote:
> > On Mon, Nov 09, 2015 at 08:33:55AM +0800, haozhong.zh...@intel.com wrote:
> > > On 11/06/15 13:12, Eduardo Habkost wrote:
> > > > On Fri, Nov 06, 2015 at 10:32:24AM +0800, haozhong.zh...@intel.com 
> > > > wrote:
> > > > > On 11/05/15 14:05, Eduardo Habkost wrote:
> > > > > > On Thu, Nov 05, 2015 at 09:30:51AM +0800, Haozhong Zhang wrote:
> > > > > > > On 11/04/15 19:42, Eduardo Habkost wrote:
> > > > [...]
> > > > > > > > > +env->tsc_khz_saved = r;
> > > > > > > > > +}
> > > > > > > > 
> > > > > > > > Why do you need a separate tsc_khz_saved field, and don't 
> > > > > > > > simply use
> > > > > > > > tsc_khz? It would have the additional feature of letting QMP 
> > > > > > > > clients
> > > > > > > > query the current TSC rate by asking for the tsc-freq property 
> > > > > > > > on CPU
> > > > > > > > objects.
> > > > > > > >
> > > > > > > 
> > > > > > > It's to avoid overriding env->tsc_khz on the destination in the
> > > > > > > migration. I can change this line to
> > > > > > >  env->tsc_khz = env->tsc_khz_saved = r;
> > > > > > 
> > > > > > You are already avoiding overriding env->tsc_khz, because you use
> > > > > > KVM_GET_TSC_KHZ only if tsc_khz is not set yet. I still don't see 
> > > > > > why
> > > > > > you need a tsc_khz_saved field that requires duplicating the 
> > > > > > SET_TSC_KHZ
> > > > > > code, if you could just do this:
> > > > > > 
> > > > > > if (!env->tsc_khz) {
> > > > > > env->tsc_khz = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> > > > > > }
> > > > > >
> > > > > 
> > > > > Consider an example that we migrate a VM from machine A to machine B
> > > > > and then to machine C, and QEMU on machine B is launched with the cpu
> > > > > option 'tsc-freq' (i.e. env->tsc_khz on B is non-zero at the
> > > > > beginning):
> > > > >  1) In the migration from B to C, the user-specified TSC frequency by
> > > > > 'tsc-freq' on B is expected to be migrated to C. That is, the
> > > > > value of env->tsc_khz on B is migrated.
> > > > >  2) If TSC frequency is migrated through env->tsc_khz, then
> > > > > env->tsc_khz on B will be overrode in the migration from A to B
> > > > > before kvm_arch_setup_tsc_khz(). If the guest TSC frequency is
> > > > > different than the user-specified TSC frequency on B, the
> > > > > expectation in 1) will not be satisfied anymore.
> > > > 
> > > > Setting tsc-freq on B when tsc-freq was not used on A is invalid usage.
> > > > This is not different from changing the CPU model and adding or removing
> > > > CPU flags when migrating, which is also incorrect. The command-line
> > > > parameters defining the VM must be the same when you migrate.
> > > >
> > > 
> > > Good to know it's an invalid usage. Then the question is what QEMU is
> > > expected to do for this invalid usage?
> > > 
> > >  1) Abort the migration? But I find that the current QEMU does not
> > > abort the migration between different CPU models (e.g. Nehalem and
> > > Haswell).
> > > 
> > >  2) Or do not abort the migration and ignore tsc-freq option? If so,
> > > tsc_khz_saved will be not needed.
> > 
> > My first choice is to abort migration. If we decide to abort today and
> > find it to cause problems, we can easily fix it. If we decide to
> > continue without aborting, it is difficult to change that behavior
> > without breaking existing setups.
> >
> 
> Two additional questions:
> 
>  1) Existing QEMU allows 'tsc-freq' on the destination in the
> migration. If we decided to abort when both 'tsc-freq' and
> migrated TSC were present on the destination, it would break some
> existing usages. Considering backward compatibility, would above
> choice 2) be better?

We shouldn't abort simply because the section is present and tsc-freq is
set (because we will always send the section in the newer
machine-types). We should abort only when we know that the command-line
contradicts what we see in the migration stream.

> 
>  2) If we do decide to abort, could I use abort()? Or are there other
> clean approaches to abort?

You don't need to abort QEMU. You just need to tell the migration code
that migration can't continue. The exact way to do it depends on where
you are hooking the sanity check code. If you use a
VMStateDescription.post_load hook, you can use error_report() and return
a negative errno value. CCing Quintela in case he has suggestions.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-11 Thread Eduardo Habkost
On Tue, Nov 10, 2015 at 09:08:58AM +0800, Haozhong Zhang wrote:
> On 11/09/15 14:01, Eduardo Habkost wrote:
> > On Mon, Nov 09, 2015 at 08:33:55AM +0800, haozhong.zh...@intel.com wrote:
> > > On 11/06/15 13:12, Eduardo Habkost wrote:
> > > > On Fri, Nov 06, 2015 at 10:32:24AM +0800, haozhong.zh...@intel.com 
> > > > wrote:
> > > > > On 11/05/15 14:05, Eduardo Habkost wrote:
> > > > > > On Thu, Nov 05, 2015 at 09:30:51AM +0800, Haozhong Zhang wrote:
> > > > > > > On 11/04/15 19:42, Eduardo Habkost wrote:
> > > > [...]
> > > > > > > > > +env->tsc_khz_saved = r;
> > > > > > > > > +}
> > > > > > > > 
> > > > > > > > Why do you need a separate tsc_khz_saved field, and don't 
> > > > > > > > simply use
> > > > > > > > tsc_khz? It would have the additional feature of letting QMP 
> > > > > > > > clients
> > > > > > > > query the current TSC rate by asking for the tsc-freq property 
> > > > > > > > on CPU
> > > > > > > > objects.
> > > > > > > >
> > > > > > > 
> > > > > > > It's to avoid overriding env->tsc_khz on the destination in the
> > > > > > > migration. I can change this line to
> > > > > > >  env->tsc_khz = env->tsc_khz_saved = r;
> > > > > > 
> > > > > > You are already avoiding overriding env->tsc_khz, because you use
> > > > > > KVM_GET_TSC_KHZ only if tsc_khz is not set yet. I still don't see 
> > > > > > why
> > > > > > you need a tsc_khz_saved field that requires duplicating the 
> > > > > > SET_TSC_KHZ
> > > > > > code, if you could just do this:
> > > > > > 
> > > > > > if (!env->tsc_khz) {
> > > > > > env->tsc_khz = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> > > > > > }
> > > > > >
> > > > > 
> > > > > Consider an example that we migrate a VM from machine A to machine B
> > > > > and then to machine C, and QEMU on machine B is launched with the cpu
> > > > > option 'tsc-freq' (i.e. env->tsc_khz on B is non-zero at the
> > > > > beginning):
> > > > >  1) In the migration from B to C, the user-specified TSC frequency by
> > > > > 'tsc-freq' on B is expected to be migrated to C. That is, the
> > > > > value of env->tsc_khz on B is migrated.
> > > > >  2) If TSC frequency is migrated through env->tsc_khz, then
> > > > > env->tsc_khz on B will be overrode in the migration from A to B
> > > > > before kvm_arch_setup_tsc_khz(). If the guest TSC frequency is
> > > > > different than the user-specified TSC frequency on B, the
> > > > > expectation in 1) will not be satisfied anymore.
> > > > 
> > > > Setting tsc-freq on B when tsc-freq was not used on A is invalid usage.
> > > > This is not different from changing the CPU model and adding or removing
> > > > CPU flags when migrating, which is also incorrect. The command-line
> > > > parameters defining the VM must be the same when you migrate.
> > > >
> > > 
> > > Good to know it's an invalid usage. Then the question is what QEMU is
> > > expected to do for this invalid usage?
> > > 
> > >  1) Abort the migration? But I find that the current QEMU does not
> > > abort the migration between different CPU models (e.g. Nehalem and
> > > Haswell).
> > > 
> > >  2) Or do not abort the migration and ignore tsc-freq option? If so,
> > > tsc_khz_saved will be not needed.
> > 
> > My first choice is to abort migration. If we decide to abort today and
> > find it to cause problems, we can easily fix it. If we decide to
> > continue without aborting, it is difficult to change that behavior
> > without breaking existing setups.
> 
> Agree, but I would like to relax the abort condition to "abort the
> migration only if QEMU on the destination uses a different TSC
> frequency than the migrated one" so that the following usages would be
> still valid:
>  1) Only QEMU on the destination has 'tsc-freq' option, but it' set to
> the same value of the migrated one.
>  2) Only QEMU on the so

Re: [Qemu-devel] [PATCH v3 1/3] target-i386: add a subsection for migrating vcpu's TSC rate

2015-11-11 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:26:41PM +0800, Haozhong Zhang wrote:
> A new subsection 'vmstate_tsc_khz' is added to migrate vcpu's TSC
> rate. For the backwards compatibility, this subsection is not migrated
> on pc-*-2.4 and older machine types.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  hw/i386/pc.c  |  1 +
>  hw/i386/pc_piix.c |  1 +
>  hw/i386/pc_q35.c  |  1 +
>  include/hw/i386/pc.h  |  1 +
>  target-i386/cpu.h |  1 +
>  target-i386/machine.c | 21 +
>  6 files changed, 26 insertions(+)
> 
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 0cb8afd..2f2fc93 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -1952,6 +1952,7 @@ static void pc_machine_class_init(ObjectClass *oc, void 
> *data)
>  HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
>  
>  pcmc->get_hotplug_handler = mc->get_hotplug_handler;
> +pcmc->save_tsc_khz = true;
>  mc->get_hotplug_handler = pc_get_hotpug_handler;
>  mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id;
>  mc->default_boot_order = "cad";
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 393dcc4..fc71321 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -487,6 +487,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
>  m->alias = NULL;
>  m->is_default = 0;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;
>  SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
>  }
>  
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 2f8f396..858ed69 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -385,6 +385,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
>  pc_q35_2_5_machine_options(m);
>  m->alias = NULL;
>  pcmc->broken_reserved_end = true;
> +pcmc->save_tsc_khz = false;

I had suggested the PCMachineClass field, but now I've been thinking:
all other fields related to tsc_khz are in X86CPU, so I believe this
belongs to X86CPU too. It could be a simple X86CPU property set by
PC_COMPAT_2_4.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 07/35] util: introduce qemu_file_get_page_size()

2015-11-09 Thread Eduardo Habkost
On Mon, Nov 09, 2015 at 12:36:36PM +0800, Xiao Guangrong wrote:
> On 11/06/2015 11:36 PM, Eduardo Habkost wrote:
> >On Mon, Nov 02, 2015 at 05:13:09PM +0800, Xiao Guangrong wrote:
> >>There are three places use the some logic to get the page size on
> >>the file path or file fd
> >>
> >>Windows did not support file hugepage, so it will return normal page
> >>for this case. And this interface has not been used on windows so far
> >>
> >>This patch introduces qemu_file_get_page_size() to unify the code
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >[...]
> >>diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> >>index 914cef5..51437ff 100644
> >>--- a/util/oslib-posix.c
> >>+++ b/util/oslib-posix.c
> >>@@ -340,7 +340,7 @@ static void sigbus_handler(int signal)
> >>  siglongjmp(sigjump, 1);
> >>  }
> >>
> >>-static size_t fd_getpagesize(int fd)
> >>+static size_t fd_getpagesize(int fd, Error **errp)
> >>  {
> >>  #ifdef CONFIG_LINUX
> >>  struct statfs fs;
> >>@@ -351,7 +351,12 @@ static size_t fd_getpagesize(int fd)
> >>  ret = fstatfs(fd, &fs);
> >>  } while (ret != 0 && errno == EINTR);
> >>
> >>-if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
> >>+if (ret) {
> >>+error_setg_errno(errp, errno, "fstatfs is failed");
> >>+return 0;
> >>+}
> >>+
> >>+if (fs.f_type == HUGETLBFS_MAGIC) {
> >>  return fs.f_bsize;
> >>  }
> >
> >You are changing os_mem_prealloc() behavior when fstatfs() fails, here.
> >Have you ensured there are no cases where fstatfs() fails but this code
> >is still expected to work?
> 
> stat() is supported for all kinds of files, so failed stat() is caused by
> file is not exist or kernel internal error (e,g memory is not enough) or
> security check is not passed. Whichever we should not do any operation on
> the file if stat() failed. The origin code did not check it but it is worth
> being fixed i think.

Note that this is fstatfs(), not stat(). It's possible to get
ENOSYS as an error from statfs() if it is not implemented by the
filesystem; I just don't know if this really can happen in
practice.

(But the answer won't matter, as we already aborted on statfs()
errors on all codepaths that call fd_getpagesize(). See below.)

> 
> >
> >The change looks safe: gethugepagesize() seems to be always called in
> >the path that would make fd_getpagesize() be called from
> >os_mem_prealloc(), so allocation would abort much earlier if statfs()
> >failed. But I haven't confirmed that yet, and I wanted to be sure.
> >
> 
> Yes, I am entirely agree with you. :)
> 

I have just confirmed that this is the case, as:

* fd_getpagesize() is only called from os_mem_prealloc(),
* os_mem_prealloc() is called from:
  * host_memory_backend_set_prealloc()
* Using memory_region_get_fd() as the fd argument
  * host_memory_backend_memory_complete()
* Using memory_region_get_fd() as the fd argument
  * file_ram_alloc()
* After qemu_file_get_page_size()/gethugepagesize() was already called
  in the same fd (with errors checked)
* fd_getpagesize() checks for fd == -1
* The only code that sets the fd for a RAMBlock is file_ram_alloc(),
  which checks for qemu_file_get_page_size()/gethugepagesize()
  errors

So, it was already impossible to get os_mem_prealloc() called
with fd != -1 without having gethugepagesize() called first (and
gethugepagesize() already checked for statfs() errors).

Reviewed-by: Eduardo Habkost <ehabk...@redhat.com>

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 12/35] util: let qemu_fd_getlength support block device

2015-11-09 Thread Eduardo Habkost
On Mon, Nov 09, 2015 at 01:58:27PM +0800, Xiao Guangrong wrote:
> 
> 
> On 11/06/2015 11:54 PM, Eduardo Habkost wrote:
> >On Mon, Nov 02, 2015 at 05:13:14PM +0800, Xiao Guangrong wrote:
> >>lseek can not work for all block devices as the man page says:
> >>| Some devices are incapable of seeking and POSIX does not specify
> >>| which devices must support lseek().
> >>
> >>This patch tries to add the support on Linux by using BLKGETSIZE64
> >>ioctl
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >
> >On which cases is this patch necessary? Do you know any examples of
> >Linux block devices that won't work with lseek(SEEK_END)?
> 
> To be honest, i have not checked all block device, this patch was made
> based on the man page. However, i do not mind drop this patch (and maybe
> other patches) to make this pachset smaller. BLKGETSIZE64 can be added
> in the future if we meet such device.

By looking at the Linux source code implementing BLKGETSIZE64, it looks
like it should work for all block devices where SEEK_END works:

* BLKGETSIZE64 returns i_size_read(bdev->bd_inode)
  (block/ioctl.c:blkdev_ioctl())
* llseek(SEEK_END) uses i_size_read(bd_inode) as the offset
  (fs/block_dev.c:block_llseek())

That's probably why raw_getlength() never needed a Linux-specific
BLKGETSIZE call.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] qemu, pkeys: add pkeys support for qemu xsave state handling

2015-11-09 Thread Eduardo Habkost
On Mon, Nov 09, 2015 at 07:55:33PM +0800, Huaitong Han wrote:
> This patch adds pkeys support for qemu xsave state handling.
> 
> Signed-off-by: Huaitong Han 
[...]
> @@ -1145,6 +1146,7 @@ static int kvm_put_xsave(X86CPU *cpu)
>  #ifdef TARGET_X86_64
>  memcpy(&xsave->region[XSAVE_Hi16_ZMM], &env->xmm_regs[16],
>  16 * sizeof env->xmm_regs[16]);
> +memcpy(&xsave->region[XSAVE_PKRU], &env->pkru, sizeof env->pkru);
>  #endif
>  r = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
>  return r;
> @@ -1516,6 +1518,7 @@ static int kvm_get_xsave(X86CPU *cpu)
>  #ifdef TARGET_X86_64
>  memcpy(&env->xmm_regs[16], &xsave->region[XSAVE_Hi16_ZMM],
> 16 * sizeof env->xmm_regs[16]);
> +memcpy(&xsave->region[XSAVE_PKRU], &env->pkru, sizeof env->pkru);

Did you mean:
  memcpy(&env->pkru, &xsave->region[XSAVE_PKRU], sizeof env->pkru)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-09 Thread Eduardo Habkost
On Mon, Nov 09, 2015 at 08:33:55AM +0800, haozhong.zh...@intel.com wrote:
> On 11/06/15 13:12, Eduardo Habkost wrote:
> > On Fri, Nov 06, 2015 at 10:32:24AM +0800, haozhong.zh...@intel.com wrote:
> > > On 11/05/15 14:05, Eduardo Habkost wrote:
> > > > On Thu, Nov 05, 2015 at 09:30:51AM +0800, Haozhong Zhang wrote:
> > > > > On 11/04/15 19:42, Eduardo Habkost wrote:
> > [...]
> > > > > > > +env->tsc_khz_saved = r;
> > > > > > > +}
> > > > > > 
> > > > > > Why do you need a separate tsc_khz_saved field, and don't simply use
> > > > > > tsc_khz? It would have the additional feature of letting QMP clients
> > > > > > query the current TSC rate by asking for the tsc-freq property on 
> > > > > > CPU
> > > > > > objects.
> > > > > >
> > > > > 
> > > > > It's to avoid overriding env->tsc_khz on the destination in the
> > > > > migration. I can change this line to
> > > > >  env->tsc_khz = env->tsc_khz_saved = r;
> > > > 
> > > > You are already avoiding overriding env->tsc_khz, because you use
> > > > KVM_GET_TSC_KHZ only if tsc_khz is not set yet. I still don't see why
> > > > you need a tsc_khz_saved field that requires duplicating the SET_TSC_KHZ
> > > > code, if you could just do this:
> > > > 
> > > > if (!env->tsc_khz) {
> > > > env->tsc_khz = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> > > > }
> > > >
> > > 
> > > Consider an example that we migrate a VM from machine A to machine B
> > > and then to machine C, and QEMU on machine B is launched with the cpu
> > > option 'tsc-freq' (i.e. env->tsc_khz on B is non-zero at the
> > > beginning):
> > >  1) In the migration from B to C, the user-specified TSC frequency by
> > > 'tsc-freq' on B is expected to be migrated to C. That is, the
> > > value of env->tsc_khz on B is migrated.
> > >  2) If TSC frequency is migrated through env->tsc_khz, then
> > > env->tsc_khz on B will be overrode in the migration from A to B
> > > before kvm_arch_setup_tsc_khz(). If the guest TSC frequency is
> > > different than the user-specified TSC frequency on B, the
> > > expectation in 1) will not be satisfied anymore.
> > 
> > Setting tsc-freq on B when tsc-freq was not used on A is invalid usage.
> > This is not different from changing the CPU model and adding or removing
> > CPU flags when migrating, which is also incorrect. The command-line
> > parameters defining the VM must be the same when you migrate.
> >
> 
> Good to know it's an invalid usage. Then the question is what QEMU is
> expected to do for this invalid usage?
> 
>  1) Abort the migration? But I find that the current QEMU does not
> abort the migration between different CPU models (e.g. Nehalem and
> Haswell).
> 
>  2) Or do not abort the migration and ignore tsc-freq option? If so,
> tsc_khz_saved will be not needed.

My first choice is to abort migration. If we decide to abort today and
find it to cause problems, we can easily fix it. If we decide to
continue without aborting, it is difficult to change that behavior
without breaking existing setups.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 11/35] util: introduce qemu_file_getlength()

2015-11-09 Thread Eduardo Habkost
On Mon, Nov 09, 2015 at 12:44:55PM +0800, Xiao Guangrong wrote:
> On 11/06/2015 11:50 PM, Eduardo Habkost wrote:
> >As this patch affects raw_getlength(), CCing the raw block driver
> >maintainer and the qemu-block mailing list.
> 
> Eduardo, thanks for your reminder. I will keep CCing Kevin and qemu-block mail
> list for future version.
> 
> >
> >On Mon, Nov 02, 2015 at 05:13:13PM +0800, Xiao Guangrong wrote:
> >>It is used to get the size of the specified file, also qemu_fd_getlength()
> >>is introduced to unify the code with raw_getlength() in block/raw-posix.c
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >>---
> >>  block/raw-posix.c|  7 +--
> >>  include/qemu/osdep.h |  2 ++
> >>  util/osdep.c | 31 +++
> >
> >I know I was the one who suggested osdep.c, but maybe oslib-posix.c is a
> >more appropriate place for the new function?
> >
> 
> Since the function we introduced here can work on both windows and posix, so
> i thing osdep.c is the right place. Otherwise we should implement it for 
> multiple
> platforms.

I didn't notice it was going to be used by a platform-independent
qemu_file_getlength() function in addition to the posix-specific
raw_getlength(). Have you tested it on Windows, though?

If you didn't test it on Windows, what about keeping
qemu_file_getlength() available only on posix, for now? The only
users are raw-posix.c and hostmem-file.c, currently. If in the
future somebody needs it on Windows, they can decide between
moving the SEEK_END code to osdep.c (after testing it), or moving
the existing raw-win32.c:raw_getlength() code to an
oslib-win32.c:qemu_file_getlength() function.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-06 Thread Eduardo Habkost
On Fri, Nov 06, 2015 at 10:32:24AM +0800, haozhong.zh...@intel.com wrote:
> On 11/05/15 14:05, Eduardo Habkost wrote:
> > On Thu, Nov 05, 2015 at 09:30:51AM +0800, Haozhong Zhang wrote:
> > > On 11/04/15 19:42, Eduardo Habkost wrote:
[...]
> > > > > +env->tsc_khz_saved = r;
> > > > > +}
> > > > 
> > > > Why do you need a separate tsc_khz_saved field, and don't simply use
> > > > tsc_khz? It would have the additional feature of letting QMP clients
> > > > query the current TSC rate by asking for the tsc-freq property on CPU
> > > > objects.
> > > >
> > > 
> > > It's to avoid overriding env->tsc_khz on the destination in the
> > > migration. I can change this line to
> > >  env->tsc_khz = env->tsc_khz_saved = r;
> > 
> > You are already avoiding overriding env->tsc_khz, because you use
> > KVM_GET_TSC_KHZ only if tsc_khz is not set yet. I still don't see why
> > you need a tsc_khz_saved field that requires duplicating the SET_TSC_KHZ
> > code, if you could just do this:
> > 
> > if (!env->tsc_khz) {
> > env->tsc_khz = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> > }
> >
> 
> Consider an example that we migrate a VM from machine A to machine B
> and then to machine C, and QEMU on machine B is launched with the cpu
> option 'tsc-freq' (i.e. env->tsc_khz on B is non-zero at the
> beginning):
>  1) In the migration from B to C, the user-specified TSC frequency by
> 'tsc-freq' on B is expected to be migrated to C. That is, the
> value of env->tsc_khz on B is migrated.
>  2) If TSC frequency is migrated through env->tsc_khz, then
> env->tsc_khz on B will be overrode in the migration from A to B
> before kvm_arch_setup_tsc_khz(). If the guest TSC frequency is
> different than the user-specified TSC frequency on B, the
> expectation in 1) will not be satisfied anymore.

Setting tsc-freq on B when tsc-freq was not used on A is invalid usage.
This is not different from changing the CPU model and adding or removing
CPU flags when migrating, which is also incorrect. The command-line
parameters defining the VM must be the same when you migrate.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 3/3] target-i386: load the migrated vcpu's TSC rate

2015-11-06 Thread Eduardo Habkost
On Fri, Nov 06, 2015 at 10:32:44AM +0800, Haozhong Zhang wrote:
> On 11/05/15 14:10, Eduardo Habkost wrote:
> > On Mon, Nov 02, 2015 at 05:26:43PM +0800, Haozhong Zhang wrote:
> > > Set vcpu's TSC rate to the migrated value if the user does not specify a
> > > TSC rate by cpu option 'tsc-freq' and a migrated TSC rate does exist. If
> > > KVM supports TSC scaling, guest programs will observe TSC increasing in
> > > the migrated rate other than the host TSC rate.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>
> > > ---
> > >  target-i386/kvm.c | 21 +
> > >  1 file changed, 21 insertions(+)
> > > 
> > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > index aae5e58..2be70df 100644
> > > --- a/target-i386/kvm.c
> > > +++ b/target-i386/kvm.c
> > > @@ -3042,6 +3042,27 @@ int kvm_arch_setup_tsc_khz(CPUState *cs)
> > >  int r;
> > >  
> > >  /*
> > > + * If a TSC rate is migrated and the user does not specify the
> > > + * vcpu's TSC rate on the destination, the migrated TSC rate will
> > > + * be used on the destination after the migration.
> > > + */
> > > +if (env->tsc_khz_saved && !env->tsc_khz) {
> > > +if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
> > > +r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz_saved);
> > 
> > Why are you duplicating the existing KVM_SET_TSC_KHZ code in
> > kvm_arch_init_vcpu()?
> >
> 
> Because they are called in different cases and their behaviors on
> failure are different:
>  1) KVM_SET_TSC_KHZ in kvm_arch_init_vcpu() is called only when a VM
> is created and a user-specified TSC frequency is given. If it
> fails, QEMU will abort.
>  2) KVM_SET_TSC_KHZ in kvm_arch_setup_tsc_khz() is called on the
> destination only when TSC frequency is migrated and no
> user-specified TSC frequency is given. If it fails, QEMU as well
> as the migration will not be aborted.
> 
> However, after reading your comment at the end, they really could be
> merged.

Agreed that the expected behavior on failure is different. But it looks
like we are now on the same page about not duplicating code, with the
code you suggested below. :)

> 
> > > +if (r < 0) {
> > > +fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> > 
> > If you want to report errors, please use error_report().
> > 
> > (But I don't think we want to print those warnings. See below.)
> > 
> > > +}
> > > +} else {
> > > +r = -1;
> > > +fprintf(stderr, "KVM doesn't support TSC scaling\n");
> > > +}
> > > +if (r < 0) {
> > > +fprintf(stderr, "Use host TSC frequency instead. "
> > 
> > Did you mean "Using host TSC frequency instead."?
> >
> 
> Yes.
> 
> > > +"Guest TSC may be inaccurate.\n");
> > > +}
> > > +}
> > 
> > This will make QEMU print a warning every single time when migrating to
> > hosts that don't support TSC scaling, even if the source and destination
> > hosts already have the same TSC frequency. That means most users will
> > see a bogus warning, in today's hardware.
> > 
> > Maybe it will be acceptable to print a warning if (and only if) we know
> > that the host TSC is different from the original TSC frequency.
> >
> 
> Agree, I should add such a check to avoid bogus warnings.
> 
> > Considering that we already have code to handle tsc_khz that prints an
> > error, you don't need to duplicate it. You could handle both
> > user-provided and migration tsc_khz cases with the same code. With
> > something like this:
> >
> 
> Mostly, but as tsc_khz_saved in patch 2 is really needed, I'll make
> some minor changes.
> 
> - if (env->tsc_khz) { /* may be set by the user, or loaded from incoming 
> migration */
> + if (env->tsc_khz || env->tsc_khz_saved) { /* may be set by the user, or 
> loaded from incoming migration */
> + int64_t tgt_tsc_khz = env->tsc_khz ? : env->tsc_khz_saved;
> > r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
> - kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
> + kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, tgt_tsc_khz) :
> > -ENOTSUP;
> > if (r < 0) {
> &

Re: [PATCH v7 07/35] util: introduce qemu_file_get_page_size()

2015-11-06 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:13:09PM +0800, Xiao Guangrong wrote:
> There are three places that use the same logic to get the page size on
> the file path or file fd
> 
> Windows did not support file hugepage, so it will return normal page
> for this case. And this interface has not been used on windows so far
>  
> This patch introduces qemu_file_get_page_size() to unify the code
> 
> Signed-off-by: Xiao Guangrong 
[...]
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 914cef5..51437ff 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -340,7 +340,7 @@ static void sigbus_handler(int signal)
>  siglongjmp(sigjump, 1);
>  }
>  
> -static size_t fd_getpagesize(int fd)
> +static size_t fd_getpagesize(int fd, Error **errp)
>  {
>  #ifdef CONFIG_LINUX
>  struct statfs fs;
> @@ -351,7 +351,12 @@ static size_t fd_getpagesize(int fd)
>  ret = fstatfs(fd, &fs);
>  } while (ret != 0 && errno == EINTR);
>  
> -if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
> +if (ret) {
> +error_setg_errno(errp, errno, "fstatfs is failed");
> +return 0;
> +}
> +
> +if (fs.f_type == HUGETLBFS_MAGIC) {
>  return fs.f_bsize;
>  }

You are changing os_mem_prealloc() behavior when fstatfs() fails, here.
Have you ensured there are no cases where fstatfs() fails but this code
is still expected to work?

The change looks safe: gethugepagesize() seems to be always called in
the path that would make fd_getpagesize() be called from
os_mem_prealloc(), so allocation would abort much earlier if statfs()
failed. But I haven't confirmed that yet, and I wanted to be sure.


-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 12/35] util: let qemu_fd_getlength support block device

2015-11-06 Thread Eduardo Habkost
On Tue, Nov 03, 2015 at 12:21:30AM +0800, Xiao Guangrong wrote:
> On 11/03/2015 12:11 AM, Vladimir Sementsov-Ogievskiy wrote:
> >On 02.11.2015 12:13, Xiao Guangrong wrote:
> >>lseek can not work for all block devices as the man page says:
> >>| Some devices are incapable of seeking and POSIX does not specify
> >>| which devices must support lseek().
> >>
> >>This patch tries to add the support on Linux by using BLKGETSIZE64
> >>ioctl
> >>
> >>Signed-off-by: Xiao Guangrong 
> >>---
> >>  util/osdep.c | 20 
> >>  1 file changed, 20 insertions(+)
> >>
> >>diff --git a/util/osdep.c b/util/osdep.c
> >>index 5a61e19..b20c793 100644
> >>--- a/util/osdep.c
> >>+++ b/util/osdep.c
> >>@@ -45,6 +45,11 @@
> >>  extern int madvise(caddr_t, size_t, int);
> >>  #endif
> >>+#ifdef CONFIG_LINUX
> >>+#include 
> >>+#include 
> >>+#endif
> >>+
> >>  #include "qemu-common.h"
> >>  #include "qemu/sockets.h"
> >>  #include "qemu/error-report.h"
> >>@@ -433,6 +438,21 @@ int64_t qemu_fd_getlength(int fd)
> >>  {
> >>  int64_t size;
> >>+#ifdef CONFIG_LINUX
> >>+struct stat stat_buf;
> >>+if (fstat(fd, &stat_buf) < 0) {
> >>+return -errno;
> >>+}
> >>+
> >>+if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
> >>+/* The size of block device is larger than max int64_t? */
> >>+if (size < 0) {
> >>+return -EOVERFLOW;
> >>+}
> >>+return size;
> >>+}
> >>+#endif
> >>+
> >>  size = lseek(fd, 0, SEEK_END);
> >>  if (size < 0) {
> >>  return -errno;
> >
> >Reviewed-by: Vladimir Sementsov-Ogievskiy 
> >
> >just a question: is there any use for stat.st_size ? Is it always worse then 
> >lseek?
> 
> The man page says:
> The  st_size field gives the size of the file (if it is a regular file or a 
> symbolic link)
> in bytes.  The size of a symbolic link is the length of the pathname it 
> contains, without a
> terminating null byte.
> 
> So it can not work on symbolic link.

stat() and fstat() will get you information for the file pointed by the
symlink. You will get the symlink information only if you use lstat().

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 12/35] util: let qemu_fd_getlength support block device

2015-11-06 Thread Eduardo Habkost
As this patch affects raw_getlength(), CCing the raw block driver
maintainer and the qemu-block mailing list.

On Mon, Nov 02, 2015 at 05:13:14PM +0800, Xiao Guangrong wrote:
> lseek can not work for all block devices as the man page says:
> | Some devices are incapable of seeking and POSIX does not specify
> | which devices must support lseek().
> 
> This patch tries to add the support on Linux by using BLKGETSIZE64
> ioctl
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  util/osdep.c | 20 
>  1 file changed, 20 insertions(+)
> 
> diff --git a/util/osdep.c b/util/osdep.c
> index 5a61e19..b20c793 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -45,6 +45,11 @@
>  extern int madvise(caddr_t, size_t, int);
>  #endif
>  
> +#ifdef CONFIG_LINUX
> +#include 
> +#include 
> +#endif
> +
>  #include "qemu-common.h"
>  #include "qemu/sockets.h"
>  #include "qemu/error-report.h"
> @@ -433,6 +438,21 @@ int64_t qemu_fd_getlength(int fd)
>  {
>  int64_t size;
>  
> +#ifdef CONFIG_LINUX
> +struct stat stat_buf;
> +if (fstat(fd, &stat_buf) < 0) {
> +return -errno;
> +}
> +
> +if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
> +/* The size of block device is larger than max int64_t? */
> +if (size < 0) {
> +return -EOVERFLOW;
> +}
> +return size;
> +}
> +#endif
> +
>  size = lseek(fd, 0, SEEK_END);
>  if (size < 0) {
>  return -errno;
> -- 
> 1.8.3.1
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 11/35] util: introduce qemu_file_getlength()

2015-11-06 Thread Eduardo Habkost
As this patch affects raw_getlength(), CCing the raw block driver
maintainer and the qemu-block mailing list.

On Mon, Nov 02, 2015 at 05:13:13PM +0800, Xiao Guangrong wrote:
> It is used to get the size of the specified file, also qemu_fd_getlength()
> is introduced to unify the code with raw_getlength() in block/raw-posix.c
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  block/raw-posix.c|  7 +--
>  include/qemu/osdep.h |  2 ++
>  util/osdep.c | 31 +++

I know I was the one who suggested osdep.c, but maybe oslib-posix.c is a
more appropriate place for the new function?


>  3 files changed, 34 insertions(+), 6 deletions(-)
> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 918c756..734e6dd 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -1592,18 +1592,13 @@ static int64_t raw_getlength(BlockDriverState *bs)
>  {
>  BDRVRawState *s = bs->opaque;
>  int ret;
> -int64_t size;
>  
>  ret = fd_open(bs);
>  if (ret < 0) {
>  return ret;
>  }
>  
> -size = lseek(s->fd, 0, SEEK_END);
> -if (size < 0) {
> -return -errno;
> -}
> -return size;
> +return qemu_fd_getlength(s->fd);
>  }
>  #endif
>  
> diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> index dbc17dc..ca4c3fa 100644
> --- a/include/qemu/osdep.h
> +++ b/include/qemu/osdep.h
> @@ -303,4 +303,6 @@ int qemu_read_password(char *buf, int buf_size);
>  pid_t qemu_fork(Error **errp);
>  
>  size_t qemu_file_get_page_size(const char *mem_path, Error **errp);
> +int64_t qemu_fd_getlength(int fd);
> +size_t qemu_file_getlength(const char *file, Error **errp);
>  #endif
> diff --git a/util/osdep.c b/util/osdep.c
> index 0092bb6..5a61e19 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -428,3 +428,34 @@ writev(int fd, const struct iovec *iov, int iov_cnt)
>  return readv_writev(fd, iov, iov_cnt, true);
>  }
>  #endif
> +
> +int64_t qemu_fd_getlength(int fd)
> +{
> +int64_t size;
> +
> +size = lseek(fd, 0, SEEK_END);
> +if (size < 0) {
> +return -errno;
> +}
> +return size;
> +}
> +
> +size_t qemu_file_getlength(const char *file, Error **errp)
> +{
> +int64_t size;
> +int fd = qemu_open(file, O_RDONLY);
> +
> +if (fd < 0) {
> +error_setg_file_open(errp, errno, file);
> +return 0;
> +}
> +
> +size = qemu_fd_getlength(fd);
> +if (size < 0) {
> +error_setg_errno(errp, -size, "can't get size of file %s", file);
> +size = 0;
> +}
> +
> +qemu_close(fd);
> +return size;
> +}
> -- 
> 1.8.3.1
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 12/35] util: let qemu_fd_getlength support block device

2015-11-06 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:13:14PM +0800, Xiao Guangrong wrote:
> lseek can not work for all block devices as the man page says:
> | Some devices are incapable of seeking and POSIX does not specify
> | which devices must support lseek().
> 
> This patch tries to add the support on Linux by using BLKGETSIZE64
> ioctl
> 
> Signed-off-by: Xiao Guangrong 

On which cases is this patch necessary? Do you know any examples of
Linux block devices that won't work with lseek(SEEK_END)?

> ---
>  util/osdep.c | 20 
>  1 file changed, 20 insertions(+)
> 
> diff --git a/util/osdep.c b/util/osdep.c
> index 5a61e19..b20c793 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -45,6 +45,11 @@
>  extern int madvise(caddr_t, size_t, int);
>  #endif
>  
> +#ifdef CONFIG_LINUX
> +#include 
> +#include 
> +#endif
> +
>  #include "qemu-common.h"
>  #include "qemu/sockets.h"
>  #include "qemu/error-report.h"
> @@ -433,6 +438,21 @@ int64_t qemu_fd_getlength(int fd)
>  {
>  int64_t size;
>  
> +#ifdef CONFIG_LINUX
> +struct stat stat_buf;
> +if (fstat(fd, &stat_buf) < 0) {
> +return -errno;
> +}
> +
> +if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
> +/* The size of block device is larger than max int64_t? */
> +if (size < 0) {
> +return -EOVERFLOW;
> +}
> +return size;
> +}
> +#endif
> +
>  size = lseek(fd, 0, SEEK_END);
>  if (size < 0) {
>  return -errno;
> -- 
> 1.8.3.1
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] target-i386: tcg: Accept clwb instruction

2015-11-06 Thread Eduardo Habkost
On Fri, Nov 06, 2015 at 12:04:56PM +0100, Richard Henderson wrote:
> On 11/04/2015 10:24 PM, Eduardo Habkost wrote:
> >Accept the clwb instruction (66 0F AE /6) if its corresponding feature
> >flag is enabled on CPUID[7].
> >
> >Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
> >---
> >  target-i386/translate.c | 13 -
> >  1 file changed, 12 insertions(+), 1 deletion(-)
> >
> >diff --git a/target-i386/translate.c b/target-i386/translate.c
> >index b400d24..bac1685 100644
> >--- a/target-i386/translate.c
> >+++ b/target-i386/translate.c
> >@@ -7716,10 +7716,21 @@ static target_ulong disas_insn(CPUX86State *env, 
> >DisasContext *s,
> >  }
> >  break;
> >  case 5: /* lfence */
> >-case 6: /* mfence */
> >  if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & 
> > CPUID_SSE2))
> >  goto illegal_op;
> >  break;
> >+case 6: /* mfence/clwb */
> >+if (s->prefix & PREFIX_DATA) {
> >+/* clwb */
> >+if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB))
> >+goto illegal_op;
> >+gen_lea_modrm(env, s, modrm);
> 
> You should use gen_nop_modrm here, since we're not going to do anything with
> the address.  Otherwise,
> 
> Reviewed-by: Richard Henderson <r...@twiddle.net>

Thanks! BTW, clflush uses gen_lea_modrm() too, does it do anything with
the address somewhere else?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 3/3] target-i386: load the migrated vcpu's TSC rate

2015-11-05 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:26:43PM +0800, Haozhong Zhang wrote:
> Set vcpu's TSC rate to the migrated value if the user does not specify a
> TSC rate by cpu option 'tsc-freq' and a migrated TSC rate does exist. If
> KVM supports TSC scaling, guest programs will observe TSC increasing in
> the migrated rate other than the host TSC rate.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  target-i386/kvm.c | 21 +
>  1 file changed, 21 insertions(+)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index aae5e58..2be70df 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -3042,6 +3042,27 @@ int kvm_arch_setup_tsc_khz(CPUState *cs)
>  int r;
>  
>  /*
> + * If a TSC rate is migrated and the user does not specify the
> + * vcpu's TSC rate on the destination, the migrated TSC rate will
> + * be used on the destination after the migration.
> + */
> +if (env->tsc_khz_saved && !env->tsc_khz) {
> +if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
> +r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz_saved);

Why are you duplicating the existing KVM_SET_TSC_KHZ code in
kvm_arch_init_vcpu()?

> +if (r < 0) {
> +fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");

If you want to report errors, please use error_report().

(But I don't think we want to print those warnings. See below.)

> +}
> +} else {
> +r = -1;
> +fprintf(stderr, "KVM doesn't support TSC scaling\n");
> +}
> +if (r < 0) {
> +fprintf(stderr, "Use host TSC frequency instead. "

Did you mean "Using host TSC frequency instead."?

> +"Guest TSC may be inaccurate.\n");
> +}
> +}

This will make QEMU print a warning every single time when migrating to
hosts that don't support TSC scaling, even if the source and destination
hosts already have the same TSC frequency. That means most users will
see a bogus warning, in today's hardware.

Maybe it will be acceptable to print a warning if (and only if) we know
that the host TSC is different from the original TSC frequency.

Considering that we already have code to handle tsc_khz that prints an
error, you don't need to duplicate it. You could handle both
user-provided and migration tsc_khz cases with the same code. With
something like this:

if (env->tsc_khz) { /* may be set by the user, or loaded from incoming 
migration */
r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
-ENOTSUP;
if (r < 0) {
int64_t cur_freq = kvm_check_extension(KVM_CAP_GET_TSC_KHZ)) ?
   kvm_vcpu_ioctl(KVM_GET_TSC_KHZ) :
   0;
/* If we know the host frequency, print a warning every time
 * there's a mismatch.
 * If we don't know the host frequency, print a warning only
 * if the user asked for a specific TSC frequency.
 */
if ((cur_freq <= 0 && env->tsc_freq_requested_by_user) ||
(cur_freq > 0 && cur_freq != env->tsc_khz)) {
error_report("warning: TSC frequency mismatch between VM and 
host, and TSC scaling unavailable");
if (env->tsc_freq_set_by_user) {
return r;
}
}
}
}

You will just need a new tsc_freq_requested_by_user field to track if
the TSC frequency was explicitly requested by the user.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-05 Thread Eduardo Habkost
On Thu, Nov 05, 2015 at 09:30:51AM +0800, Haozhong Zhang wrote:
> On 11/04/15 19:42, Eduardo Habkost wrote:
> > On Mon, Nov 02, 2015 at 05:26:42PM +0800, Haozhong Zhang wrote:
> > > The value of the migrated vcpu's TSC rate is determined as below.
> > >  1. If a TSC rate is specified by the cpu option 'tsc-freq', then this
> > > user-specified value will be used.
> > >  2. If neither a user-specified TSC rate nor a migrated TSC rate is
> > > present, we will use the TSC rate from KVM (returned by
> > > KVM_GET_TSC_KHZ).
> > >  3. Otherwise, we will use the migrated TSC rate.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com>
> > [...]
> > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > > index 64046cb..aae5e58 100644
> > > --- a/target-i386/kvm.c
> > > +++ b/target-i386/kvm.c
> > > @@ -3034,3 +3034,36 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
> > >  {
> > >  abort();
> > >  }
> > > +
> > > +int kvm_arch_setup_tsc_khz(CPUState *cs)
> > > +{
> > > +X86CPU *cpu = X86_CPU(cs);
> > > +CPUX86State *env = &cpu->env;
> > > +int r;
> > > +
> > > +/*
> > > + * Prepare vcpu's TSC rate to be migrated.
> > > + *
> > > + * - If the user specifies the TSC rate by cpu option 'tsc-freq',
> > > + *   we will use the user-specified value.
> > > + *
> > > + * - If there is neither user-specified TSC rate nor migrated TSC
> > > + *   rate, we will ask KVM for the TSC rate by calling
> > > + *   KVM_GET_TSC_KHZ.
> > > + *
> > > + * - Otherwise, if there is a migrated TSC rate, we will use the
> > > + *   migrated value.
> > > + */
> > > +if (env->tsc_khz) {
> > > +env->tsc_khz_saved = env->tsc_khz;
> > > +} else if (!env->tsc_khz_saved) {
> > > +r = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> > > +if (r < 0) {
> > > +fprintf(stderr, "KVM_GET_TSC_KHZ failed\n");
> > > +return r;
> > > +}
> > 
> > The lack of KVM_CAP_GET_TSC_KHZ should make QEMU abort, unless the user
> > is explicitly requesting a more strict mode where the TSC frequency will
> > be guaranteed to never change.
> >
> 
> I agree KVM_CAP_GET_TSC_KHZ should be checked before KVM_GET_TSC_KHZ,
> but I don't think the lack of it should abort QEMU.


Oops, I meant to write: "the lack of KVM_CAP_GET_TSC_KHZ should not
abort QEMU".

> This piece of code
> on the source machine is just to get the TSC frequency to be
> migrated. If it fails, it will leave env->tsc_khz_saved be 0. And
> according to tsc_khz_needed() in patch 1, if env->tsc_khz_saved == 0,
> no TSC frequency will be migrated. So the lack of KVM_CAP_GET_TSC_KHZ
> only hurts the migration and does not need to abort QEMU on the source
> machine.

The lack of KVM_CAP_GET_TSC_KHZ shouldn't prevent migration either, but
it looks like your code is not doing that: errors from
kvm_arch_setup_tsc_khz() are being ignored by
do_kvm_cpu_synchronize_post_init(). Sorry for the noise.

> 
> > > +env->tsc_khz_saved = r;
> > > +}
> > 
> > Why do you need a separate tsc_khz_saved field, and don't simply use
> > tsc_khz? It would have the additional feature of letting QMP clients
> > query the current TSC rate by asking for the tsc-freq property on CPU
> > objects.
> >
> 
> It's to avoid overriding env->tsc_khz on the destination in the
> migration. I can change this line to
>  env->tsc_khz = env->tsc_khz_saved = r;

You are already avoiding overriding env->tsc_khz, because you use
KVM_GET_TSC_KHZ only if tsc_khz is not set yet. I still don't see why
you need a tsc_khz_saved field that requires duplicating the SET_TSC_KHZ
code, if you could just do this:

if (!env->tsc_khz) {
env->tsc_khz = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
}


> 
> For the additional QMP feature, will the value of tsc-freq property be
> env->tsc_khz? If yes, I guess the above change would be fine?

It may work, but I still don't see the point of duplicating the field
and duplicating the existing SET_TSC_KHZ code.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] target-i386: enable cflushopt/clwb/pcommit instructions

2015-11-05 Thread Eduardo Habkost
On Thu, Nov 05, 2015 at 08:51:24AM +0100, Richard Henderson wrote:
> On 11/04/2015 08:35 PM, Eduardo Habkost wrote:
> >On Fri, Oct 30, 2015 at 01:54:33PM -0700, Richard Henderson wrote:
> >>On 10/29/2015 12:31 AM, Xiao Guangrong wrote:
> >>>These instructions are used by NVDIMM drivers and the specification
> >>>locates at:
> >>>https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
> >>>
> >>>There instructions are available on Skylake Server
> >>>
> >>>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >>>---
> >>>  target-i386/cpu.c | 8 +---
> >>>  target-i386/cpu.h | 3 +++
> >>>  2 files changed, 8 insertions(+), 3 deletions(-)
> >>
> >>Reviewed-by: Richard Henderson <r...@twiddle.net>
> >>
> >>Although it would be nice to update the comments in translate.c to include 
> >>the
> >>new insns, since they overlap mfence and sfence.  At present we only check 
> >>for
> >>SSE enabled when accepting these; I suppose it's easiest to consider it 
> >>invalid
> >>to specify +clwb,-sse?
> >
> >I assume you want to add the extra SSE requirement to TCG code, not to
> >generic x86 code, then I have no objections.
> 
> I don't really want to add any requirement, just point and laugh at anyone
> who reports an bug for the above condition.
> 
> >But in the case of clwb (/6 with a memory operand, modrm != 0xc0), we
> >are not just requiring SSE2: we are rejecting the instruction unless
> >modrm == 0xc0. That means TCG is rejecting the clwb instruction, so I
> >believe we shouldn't add CLWB to TCG_7_0_EBX_FEATURES yet.
> 
> Hmm, yes.
> 
> I've cleaned up some of this code on a branch, but it didn't get enough
> testing or review this cycle, so it's going to wait for the next.  I see
> you've posted a patch for this, which should be good enough until then.

I will apply this patch without the TCG_*_FEATURES changes until we
change TCG, then. That's OK?

About the TCG patches I have sent, please let me know if they look good
and appropriate for 2.5. This is the first time I have touched TCG code.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 20/35] dimm: get mapped memory region from DIMMDeviceClass->get_memory_region

2015-11-05 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:13:22PM +0800, Xiao Guangrong wrote:
[...]
>  static MemoryRegion *pc_dimm_get_memory_region(DIMMDevice *dimm)
>  {
> -return host_memory_backend_get_memory(dimm->hostmem, &error_abort);
> +Error *local_err = NULL;
> +MemoryRegion *mr;
> +
> +mr = host_memory_backend_get_memory(dimm->hostmem, &local_err);
> +
> +/*
> + * plug a pc-dimm device whose backend memory was not properly
> + * initialized?
> + */
> +assert(!local_err && mr);

I don't know if you are going to remove the errp parameter in the next
version, but if you want to simply abort in case an error is reported by
a function, you can use &error_abort.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[kvm-unit-test RFC] x86: Memory instructions test case

2015-11-04 Thread Eduardo Habkost
Quickly hacked test case for memory instructions (clflush, mfence,
sfence, lfence, clflushopt, clwb, pcommit), that simply checks for #UD
exceptions.

This was useful to test TCG handling of those instructions.

The "fake clwb" part will probably break once a new instruction uses
those opcodes.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 config/config-x86-common.mak |  2 +
 config/config-x86_64.mak |  2 +-
 x86/memory.c | 88 
 3 files changed, 91 insertions(+), 1 deletion(-)
 create mode 100644 x86/memory.c

diff --git a/config/config-x86-common.mak b/config/config-x86-common.mak
index c2f9908..b89684d 100644
--- a/config/config-x86-common.mak
+++ b/config/config-x86-common.mak
@@ -108,6 +108,8 @@ $(TEST_DIR)/vmx.elf: $(cstart.o) $(TEST_DIR)/vmx.o 
$(TEST_DIR)/vmx_tests.o
 
 $(TEST_DIR)/debug.elf: $(cstart.o) $(TEST_DIR)/debug.o
 
+$(TEST_DIR)/memory.elf: $(cstart.o) $(TEST_DIR)/memory.o
+
 arch_clean:
$(RM) $(TEST_DIR)/*.o $(TEST_DIR)/*.flat $(TEST_DIR)/*.elf \
$(TEST_DIR)/.*.d lib/x86/.*.d
diff --git a/config/config-x86_64.mak b/config/config-x86_64.mak
index 7d4eb34..ec4bded 100644
--- a/config/config-x86_64.mak
+++ b/config/config-x86_64.mak
@@ -7,7 +7,7 @@ tests = $(TEST_DIR)/access.flat $(TEST_DIR)/apic.flat \
  $(TEST_DIR)/emulator.flat $(TEST_DIR)/idt_test.flat \
  $(TEST_DIR)/xsave.flat $(TEST_DIR)/rmap_chain.flat \
  $(TEST_DIR)/pcid.flat $(TEST_DIR)/debug.flat \
- $(TEST_DIR)/ioapic.flat
+ $(TEST_DIR)/ioapic.flat $(TEST_DIR)/memory.flat
 tests += $(TEST_DIR)/svm.flat
 tests += $(TEST_DIR)/vmx.flat
 tests += $(TEST_DIR)/tscdeadline_latency.flat
diff --git a/x86/memory.c b/x86/memory.c
new file mode 100644
index 000..cd1eb46
--- /dev/null
+++ b/x86/memory.c
@@ -0,0 +1,88 @@
+/*
+ * Test for x86 cache and memory instructions
+ *
+ * Copyright (c) 2015 Red Hat Inc
+ *
+ * Authors:
+ *  Eduardo Habkost <ehabk...@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "libcflat.h"
+#include "desc.h"
+#include "processor.h"
+
+static long target;
+static volatile int ud;
+static volatile int isize;
+
+static void handle_ud(struct ex_regs *regs)
+{
+   ud = 1;
+   regs->rip += isize;
+}
+
+int main(int ac, char **av)
+{
+   struct cpuid cpuid7, cpuid1;
+   int xfail;
+
+   setup_idt();
+   handle_exception(UD_VECTOR, handle_ud);
+
+   cpuid1 = cpuid(1);
+   cpuid7 = cpuid_indexed(7, 0);
+
+   /* 3-byte instructions: */
+   isize = 3;
+
+   xfail = !(cpuid1.d & (1U << 19)); /* CLFLUSH */
+   ud = 0;
+   asm volatile("clflush (%0)" : : "b" (&target));
+   report_xfail("clflush", xfail, ud == 0);
+
+   xfail = !(cpuid1.d & (1U << 25)); /* SSE */
+   ud = 0;
+   asm volatile("sfence");
+   report_xfail("sfence", xfail, ud == 0);
+
+   xfail = !(cpuid1.d & (1U << 26)); /* SSE2 */
+   ud = 0;
+   asm volatile("lfence");
+   report_xfail("lfence", xfail, ud == 0);
+
+   ud = 0;
+   asm volatile("mfence");
+   report_xfail("mfence", xfail, ud == 0);
+
+   /* 4-byte instructions: */
+   isize = 4;
+
+   xfail = !(cpuid7.b & (1U << 23)); /* CLFLUSHOPT */
+   ud = 0;
+   /* clflushopt (%rbx): */
+   asm volatile(".byte 0x66, 0x0f, 0xae, 0x3b" : : "b" (&target));
+   report_xfail("clflushopt", xfail, ud == 0);
+
+   xfail = !(cpuid7.b & (1U << 24)); /* CLWB */
+   ud = 0;
+   /* clwb (%rbx): */
+   asm volatile(".byte 0x66, 0x0f, 0xae, 0x33" : : "b" (&target));
+   report_xfail("clwb", xfail, ud == 0);
+
+   ud = 0;
+   /* clwb requires a memory operand, the following is NOT a valid
+* CLWB instruction (modrm == 0xF0).
+*/
+   asm volatile(".byte 0x66, 0x0f, 0xae, 0xf0");
+   report("fake clwb", ud);
+
+   xfail = !(cpuid7.b & (1U << 22)); /* PCOMMIT */
+   ud = 0;
+   /* pcommit: */
+   asm volatile(".byte 0x66, 0x0f, 0xae, 0xf8");
+   report_xfail("pcommit", xfail, ud == 0);
+
+   return report_summary();
+}
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/3] target-i386: calculate vcpu's TSC rate to be migrated

2015-11-04 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:26:42PM +0800, Haozhong Zhang wrote:
> The value of the migrated vcpu's TSC rate is determined as below.
>  1. If a TSC rate is specified by the cpu option 'tsc-freq', then this
> user-specified value will be used.
>  2. If neither a user-specified TSC rate nor a migrated TSC rate is
> present, we will use the TSC rate from KVM (returned by
> KVM_GET_TSC_KHZ).
>  3. Otherwise, we will use the migrated TSC rate.
> 
> Signed-off-by: Haozhong Zhang 
[...]
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 64046cb..aae5e58 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -3034,3 +3034,36 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
>  {
>  abort();
>  }
> +
> +int kvm_arch_setup_tsc_khz(CPUState *cs)
> +{
> +X86CPU *cpu = X86_CPU(cs);
> +CPUX86State *env = &cpu->env;
> +int r;
> +
> +/*
> + * Prepare vcpu's TSC rate to be migrated.
> + *
> + * - If the user specifies the TSC rate by cpu option 'tsc-freq',
> + *   we will use the user-specified value.
> + *
> + * - If there is neither user-specified TSC rate nor migrated TSC
> + *   rate, we will ask KVM for the TSC rate by calling
> + *   KVM_GET_TSC_KHZ.
> + *
> + * - Otherwise, if there is a migrated TSC rate, we will use the
> + *   migrated value.
> + */
> +if (env->tsc_khz) {
> +env->tsc_khz_saved = env->tsc_khz;
> +} else if (!env->tsc_khz_saved) {
> +r = kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ);
> +if (r < 0) {
> +fprintf(stderr, "KVM_GET_TSC_KHZ failed\n");
> +return r;
> +}

The lack of KVM_CAP_GET_TSC_KHZ should make QEMU abort, unless the user
is explicitly requesting a more strict mode where the TSC frequency will
be guaranteed to never change.

> +env->tsc_khz_saved = r;
> +}

Why do you need a separate tsc_khz_saved field, and don't simply use
tsc_khz? It would have the additional feature of letting QMP clients
query the current TSC rate by asking for the tsc-freq property on CPU
objects.


> +
> +return 0;
> +}

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] target-i386: tcg: Accept clwb instruction

2015-11-04 Thread Eduardo Habkost
Accept the clwb instruction (66 0F AE /6) if its corresponding feature
flag is enabled on CPUID[7].

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/translate.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/target-i386/translate.c b/target-i386/translate.c
index b400d24..bac1685 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7716,10 +7716,21 @@ static target_ulong disas_insn(CPUX86State *env, 
DisasContext *s,
 }
 break;
 case 5: /* lfence */
-case 6: /* mfence */
 if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE2))
 goto illegal_op;
 break;
+case 6: /* mfence/clwb */
+if (s->prefix & PREFIX_DATA) {
+/* clwb */
+if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB))
+goto illegal_op;
+gen_lea_modrm(env, s, modrm);
+} else {
+/* mfence */
+if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & 
CPUID_SSE2))
+goto illegal_op;
+}
+break;
 case 7: /* sfence / clflush */
 if ((modrm & 0xc7) == 0xc0) {
 /* sfence */
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] target-i386: tcg: Check right CPUID bits for clflushopt/pcommit

2015-11-04 Thread Eduardo Habkost
Detect the clflushopt and pcommit instructions and check their
corresponding feature flags, instead of checking CPUID_SSE and
CPUID_CLFLUSH.

Signed-off-by: Eduardo Habkost <ehabk...@redhat.com>
---
 target-i386/translate.c | 28 
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/target-i386/translate.c b/target-i386/translate.c
index bac1685..a938967 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7731,16 +7731,28 @@ static target_ulong disas_insn(CPUX86State *env, 
DisasContext *s,
 goto illegal_op;
 }
 break;
-case 7: /* sfence / clflush */
+case 7: /* sfence / clflush / clflushopt / pcommit */
 if ((modrm & 0xc7) == 0xc0) {
-/* sfence */
-/* XXX: also check for cpuid_ext2_features & CPUID_EXT2_EMMX */
-if (!(s->cpuid_features & CPUID_SSE))
-goto illegal_op;
+if (s->prefix & PREFIX_DATA) {
+/* pcommit */
+if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_PCOMMIT))
+goto illegal_op;
+} else {
+/* sfence */
+/* XXX: also check for cpuid_ext2_features & 
CPUID_EXT2_EMMX */
+if (!(s->cpuid_features & CPUID_SSE))
+goto illegal_op;
+}
 } else {
-/* clflush */
-if (!(s->cpuid_features & CPUID_CLFLUSH))
-goto illegal_op;
+if (s->prefix & PREFIX_DATA) {
+/* clflushopt */
+if (!(s->cpuid_7_0_ebx_features & 
CPUID_7_0_EBX_CLFLUSHOPT))
+goto illegal_op;
+} else {
+/* clflush */
+if (!(s->cpuid_features & CPUID_CLFLUSH))
+goto illegal_op;
+}
 gen_lea_modrm(env, s, modrm);
 }
 break;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] target-i386: tcg: Handle clflushopt/clwb/pcommit instructions

2015-11-04 Thread Eduardo Habkost
This series makes TCG accept the clwb instruction, and makes it check the right
CPUID bits when handling clflushopt and pcommit.

Tested using the kvm-unit-test patch I sent earlier today:
  Subject: [kvm-unit-test RFC] x86: Memory instructions test case
  Message-Id: <1446672079-8549-1-git-send-email-ehabk...@redhat.com>

Eduardo Habkost (2):
  target-i386: tcg: Accept clwb instruction
  target-i386: tcg: Check right CPUID bits for clflushopt/pcommit

 target-i386/translate.c | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH] target-i386: enable cflushopt/clwb/pcommit instructions

2015-11-04 Thread Eduardo Habkost
On Fri, Oct 30, 2015 at 01:54:33PM -0700, Richard Henderson wrote:
> On 10/29/2015 12:31 AM, Xiao Guangrong wrote:
> > These instructions are used by NVDIMM drivers and the specification
> > locates at:
> > https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
> > 
> > There instructions are available on Skylake Server
> > 
> > Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> > ---
> >  target-i386/cpu.c | 8 +---
> >  target-i386/cpu.h | 3 +++
> >  2 files changed, 8 insertions(+), 3 deletions(-)
> 
> Reviewed-by: Richard Henderson 
> 
> Although it would be nice to update the comments in translate.c to include the
> new insns, since they overlap mfence and sfence.  At present we only check for
> SSE enabled when accepting these; I suppose it's easiest to consider it 
> invalid
> to specify +clwb,-sse?

I assume you want to add the extra SSE requirement to TCG code, not to
generic x86 code, then I have no objections. Your conclusion seems to be
right for pcommit and clflushopt, if I checked the opcodes and encoding
properly. In the case of pcommit (/7, modrm == 0xf8), we check for SSE;
in the case of clflushopt (/7 with a memory operand, modrm != 0xf8), we
check for CLFLUSH.

But in the case of clwb (/6 with a memory operand, modrm != 0xc0), we
are not just requiring SSE2: we are rejecting the instruction unless
modrm == 0xc0. That means TCG is rejecting the clwb instruction, so I
believe we shouldn't add CLWB to TCG_7_0_EBX_FEATURES yet.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 08/35] exec: allow memory to be allocated from any kind of path

2015-11-04 Thread Eduardo Habkost
On Wed, Nov 04, 2015 at 11:12:41AM +0800, Xiao Guangrong wrote:
> On 11/04/2015 07:00 AM, Eduardo Habkost wrote:
> >On Mon, Nov 02, 2015 at 05:13:10PM +0800, Xiao Guangrong wrote:
> >>Currently file_ram_alloc() is designed for hugetlbfs, however, the memory
> >>of nvdimm can come from either raw pmem device eg, /dev/pmem, or the file
> >>locates at DAX enabled filesystem
> >>
> >>So this patch let it work on any kind of path
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >>---
> >>  exec.c | 24 
> >>  1 file changed, 12 insertions(+), 12 deletions(-)
> >>
> >>diff --git a/exec.c b/exec.c
> >>index 9de38be..9075f4d 100644
> >>--- a/exec.c
> >>+++ b/exec.c
> >>@@ -1184,25 +1184,25 @@ static void *file_ram_alloc(RAMBlock *block,
> >>  char *c;
> >>  void *area;
> >>  int fd;
> >>-uint64_t hpagesize;
> >>+uint64_t pagesize;
> >>  Error *local_err = NULL;
> >>
> >>-hpagesize = qemu_file_get_page_size(path, &local_err);
> >>+pagesize = qemu_file_get_page_size(path, &local_err);
> >>  if (local_err) {
> >>  error_propagate(errp, local_err);
> >>  goto error;
> >>  }
> >>
> >>-if (hpagesize == getpagesize()) {
> >>-fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
> >>+if (pagesize == getpagesize()) {
> >>+fprintf(stderr, "Memory is not allocated from HugeTlbfs.\n");
> >
> >If the point of this patch is to allow file_ram_alloc() to not be
> >specific to hugetlbfs anymore, this warning can simply go away.
> >
> >(And in case if you really want to keep the warning, I don't see the
> >point of the changes you made to it.)
> >
> 
> This is the history why we did it like this:
> https://lists.gnu.org/archive/html/qemu-devel/2015-10/msg02862.html

The rule I am trying to follow is simple: if there are valid use cases
(e.g. nvdimm, tmpfs) where the warning will be printed every single time
QEMU runs, the warning is not appropriate.

If you really want to keep a warning, please make it not be printed on
all other valid use cases (nvdimm and tmpfs). Personally, I don't think
adding those additional checks would be worth the trouble, that's why I
suggest removing the warning.

> 
> Q:
> | What this *actually* is trying to warn against is that
> | mapping a regular file (as opposed to hugetlbfs)
> | means transparent huge pages don't work.

I don't think the author of that warning even thought about transparent
huge pages (did THP even exist when it was written?). I believe they
just assumed that the only reason for using -mem-path would be hugetlbfs
and wanted to warn about it. That assumption isn't true anymore.

> 
> | So I don't think we should drop this warning completely.
> | Either let's add the nvdimm magic, or simply check the
> | page size.
> 
> A:
> | Check the page size sounds good, will check:
> | if (pagesize != getpagesize()) {
> |...print something...
> |}
> 
> | I agree with you that showing the info is needed, however,
> | 'Warning' might scare some users, how about drop this word or
> | just show “Memory is not allocated from HugeTlbfs”?

With "warning:", I know it may be OK to do what I am doing and the
software is just trying to warn me. If there's no "warning:", I don't
even know if something is really broken in my config, or if it's just a
warning, and I would be very confused.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 11/35] util: introduce qemu_file_getlength()

2015-11-04 Thread Eduardo Habkost
On Wed, Nov 04, 2015 at 11:17:09AM +0800, Xiao Guangrong wrote:
> 
> 
> On 11/04/2015 07:21 AM, Eduardo Habkost wrote:
> >On Mon, Nov 02, 2015 at 05:13:13PM +0800, Xiao Guangrong wrote:
> >[...]
> >>+size_t qemu_file_getlength(const char *file, Error **errp)
> >>+{
> >>+int64_t size;
> >[...]
> >>+return size;
> >
> >Can you guarantee that SIZE_MAX >= INT64_MAX on all platforms supported
> >by QEMU?
> >
> 
> Actually, this function is abstracted from the common function, 
> raw_getlength(),
> in raw-posix.c whose return value is int64_t.
> 
> And i think int64_t is large enough for block devices.

int64_t should be enough, but I don't know if size_t is large enough on
all platforms.

I believe it's going to be either one of those cases:

* If you are absolutely sure SIZE_MAX >= INT64_MAX on all platforms,
  please explain why (and maybe add a QEMU_BUILD_BUG_ON?). (I don't
  think this will be the case)
* If SIZE_MAX < INT64_MAX is possible but you believe
  size <= SIZE_MAX is always true here, please explain why (and maybe
  add an assert()).
* Otherwise, we need to set an appropriate error if size > SIZE_MAX
  or change the type of qemu_file_getlength(). What about making it
  uint64_t?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 08/35] exec: allow memory to be allocated from any kind of path

2015-11-03 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:13:10PM +0800, Xiao Guangrong wrote:
> Currently file_ram_alloc() is designed for hugetlbfs, however, the memory
> of nvdimm can come from either raw pmem device eg, /dev/pmem, or the file
> locates at DAX enabled filesystem
> 
> So this patch let it work on any kind of path
> 
> Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> ---
>  exec.c | 24 
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 9de38be..9075f4d 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1184,25 +1184,25 @@ static void *file_ram_alloc(RAMBlock *block,
>  char *c;
>  void *area;
>  int fd;
> -uint64_t hpagesize;
> +uint64_t pagesize;
>  Error *local_err = NULL;
>  
> -hpagesize = qemu_file_get_page_size(path, &local_err);
> +pagesize = qemu_file_get_page_size(path, &local_err);
>  if (local_err) {
>  error_propagate(errp, local_err);
>  goto error;
>  }
>  
> -if (hpagesize == getpagesize()) {
> -fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path);
> +if (pagesize == getpagesize()) {
> +fprintf(stderr, "Memory is not allocated from HugeTlbfs.\n");

If the point of this patch is to allow file_ram_alloc() to not be
specific to hugetlbfs anymore, this warning can simply go away.

(And in case you really want to keep the warning, I don't see the
point of the changes you made to it.)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 11/35] util: introduce qemu_file_getlength()

2015-11-03 Thread Eduardo Habkost
On Mon, Nov 02, 2015 at 05:13:13PM +0800, Xiao Guangrong wrote:
[...]
> +size_t qemu_file_getlength(const char *file, Error **errp)
> +{
> +int64_t size;
[...]
> +return size;

Can you guarantee that SIZE_MAX >= INT64_MAX on all platforms supported
by QEMU?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 07/33] util: introduce qemu_file_get_page_size()

2015-10-31 Thread Eduardo Habkost
On Sat, Oct 31, 2015 at 04:09:56PM +0800, Xiao Guangrong wrote:
> On 10/30/2015 11:54 PM, Eduardo Habkost wrote:
> >On Fri, Oct 30, 2015 at 01:56:01PM +0800, Xiao Guangrong wrote:
> >>There are three places use the some logic to get the page size on
> >>the file path or file fd
> >>
> >>This patch introduces qemu_file_get_page_size() to unify the code
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
[...]
> >>diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> >>index ac70f08..c661f1c 100644
> >>--- a/target-ppc/kvm.c
> >>+++ b/target-ppc/kvm.c
> >>@@ -308,28 +308,13 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct 
> >>kvm_ppc_smmu_info *info)
> >>
> >>  static long gethugepagesize(const char *mem_path)
> >>  {
> >>-struct statfs fs;
> >>-int ret;
> >>-
> >>-do {
> >>-ret = statfs(mem_path, &fs);
> >>-} while (ret != 0 && errno == EINTR);
> >>+long size = qemu_file_get_page_size(mem_path);
> >>
> >>-if (ret != 0) {
> >>-fprintf(stderr, "Couldn't statfs() memory path: %s\n",
> >>-strerror(errno));
> >>+if (!size) {
> >>  exit(1);
> >>  }
> >>
> >>-#define HUGETLBFS_MAGIC   0x958458f6
> >>-
> >>-if (fs.f_type != HUGETLBFS_MAGIC) {
> >>-/* Explicit mempath, but it's ordinary pages */
> >>-return getpagesize();
> >>-}
> >>-
> >>-/* It's hugepage, return the huge page size */
> >>-return fs.f_bsize;
> >>+return size;
> >>  }
> >
> >Why are you changing target-ppc/kvm.c:gethugepagesize() to use the new
> >funtion, but not the copy at exec.c? To make it simpler, we could
> >eliminate both gethugepagesize() functions completely and replace them
> >with qemu_file_get_page_size() calls (maybe as part of this patch, maybe
> >in a separate patch, I'm not sure).
> >
> 
> The gethugepagesize() in exec.c will be eliminated in later patch :).

That's true. I was just expecting to see the change that eliminates
gethugepagesize() in exec.c here instead of patch 08/33. Simple cleanups
that just eliminate code duplication without change in behavior are
easier to review and more likely to be included before the rest of the
series.


> 
> And the gethugepagesize() in ppc platform has error handling logic
> and has multiple caller. It's not so bad to keep it.

Well, in case there's still room for cleanup in the ppc code, it may be
done later. No problem.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v6 11/33] hostmem-file: use whole file size if possible

2015-10-31 Thread Eduardo Habkost
On Sat, Oct 31, 2015 at 04:46:05PM +0800, Xiao Guangrong wrote:
> On 10/31/2015 01:30 AM, Eduardo Habkost wrote:
> >On Fri, Oct 30, 2015 at 01:56:05PM +0800, Xiao Guangrong wrote:
> >>Use the whole file size if @size is not specified which is useful
> >>if we want to directly pass a file to guest
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> >>---
> >>  backends/hostmem-file.c | 48 
> >> 
> >>  1 file changed, 44 insertions(+), 4 deletions(-)
> >>
> >>diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> >>index 9097a57..e1bc9ff 100644
> >>--- a/backends/hostmem-file.c
> >>+++ b/backends/hostmem-file.c
> >>@@ -9,6 +9,9 @@
> >>   * This work is licensed under the terms of the GNU GPL, version 2 or 
> >> later.
> >>   * See the COPYING file in the top-level directory.
> >>   */
> >>+#include 
> >>+#include 
> >
> >This code needs to build on other platforms too. e.g. using mingw32:
> 
> Err... You did it on Windows? It's surprised that the file is only built
> on Linux:
> common-obj-$(CONFIG_LINUX) += hostmem-file.o
> 
> How it can happen...

I did it using mingw32. I don't remember what I did, but I probably tried
something stupid to test just the build of hostmem-file.o and didn't notice it
was conditional on CONFIG_LINUX. Sorry for the noise.

> 
> >
> >   CCbackends/hostmem-file.o
> >/home/ehabkost/rh/proj/virt/qemu/backends/hostmem-file.c:12:23: fatal error: 
> >sys/ioctl.h: No such file or directory
> >  #include 
> >^
> >compilation terminated.
> >/home/ehabkost/rh/proj/virt/qemu/rules.mak:57: recipe for target 
> >'backends/hostmem-file.o' failed
> >make: *** [backends/hostmem-file.o] Error 1
> >
> >
> >>+
> >>  #include "qemu-common.h"
> >>  #include "sysemu/hostmem.h"
> >>  #include "sysemu/sysemu.h"
> >>@@ -33,20 +36,57 @@ struct HostMemoryBackendFile {
> >>  char *mem_path;
> >>  };
> >>
> >>+static uint64_t get_file_size(const char *file)
> >>+{
> >>+struct stat stat_buf;
> >>+uint64_t size = 0;
> >>+int fd;
> >>+
> >>+fd = open(file, O_RDONLY);
> >>+if (fd < 0) {
> >>+return 0;
> >>+}
> >>+
> >>+if (stat(file, &stat_buf) < 0) {
> >>+goto exit;
> >>+}
> >>+
> >>+if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
> >>+goto exit;
> >>+}
> >>+

I have another question: if our block device code at raw-posix.c doesn't need
the Linux-specific BLKGETSIZE64 call, why exactly do we need it in
hostmem-file.c? In which cases would it break without BLKGETSIZE64?

> >>+size = lseek(fd, 0, SEEK_END);
> >>+if (size == -1) {
> >>+size = 0;
> >>+}
> >>+exit:
> >>+close(fd);
> >>+return size;
> >>+}
> >
> >This code seems to duplicate what block/raw-posix.c:raw_getlength() does
> >(except for the BLKGETSIZE64 part). Have you considered using the same
> >code for both?
> >
> >We can probably move all the raw-posix.c raw_getlength(BlockDriverState
> >*bs) code to fd_getlength(int fd) functions (on osdep.c?), and just
> >implement raw-posix.c:raw_getlength(s) as fd_getlength(s->fd).
> >
> 
> Actually, Paolo has the same suggestion before... but
> 
> | The function you pointed out is really complex - it mixed 9 platforms and 
> each
> | platform has its own specific implementation. It is hard for us to verify 
> the
> | change.
> |
> | I'd prefer to make it for Linux specific first then share it to other 
> platforms
> | if it's needed in the future.
> 
> I do not know if it's really worth doing it. :(

If hostmem-file.c is Linux-specific we don't need to move or reuse the code for
all the other 9 platforms right now, that's true. But now you are adding a new
arch-specific function that does exactly the same thing in a different file to
the mix. What if somebody wants to make hostmem-file.c work on another platform
in the future, and begins to duplicate the same #ifdef mess from raw-posix.c?

I was considering this:

1) Move the arch-independent raw_getlength() code to fd_getlength() (at
osdep.c, maybe?), as:

int64_t fd_getlength(int fd)
{
int64_t size;

size = lseek(s->fd, 0, SEEK_END);
if (size < 0) {
return -errno;

Re: [Qemu-devel] [PATCH v6 08/33] exec: allow memory to be allocated from any kind of path

2015-10-31 Thread Eduardo Habkost
On Sat, Oct 31, 2015 at 03:44:39PM +0800, Xiao Guangrong wrote:
> On 10/30/2015 10:04 PM, Vladimir Sementsov-Ogievskiy wrote:
> >On 30.10.2015 08:56, Xiao Guangrong wrote:
> >>Currently file_ram_alloc() is designed for hugetlbfs, however, the memory
> >>of nvdimm can come from either raw pmem device eg, /dev/pmem, or the file
> >>locates at DAX enabled filesystem
> >>
> >>So this patch let it work on any kind of path
> >>
> >>Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
[...]
> >>-block->mr->align = hpagesize;
> >>-if (memory < hpagesize) {
> >>+if (pagesize == getpagesize()) {
> >>+fprintf(stderr, "Memory is not allocated from HugeTlbfs.\n");
> >>+}
> >
> >It is strange to see this warning every time.
> >
> >Shouldn't the differentiation be done explicitly in command line? May be 
> >separate option mem-tlb, or
> >separate flag tlbfs=on, or for new feature - new option mem-file, or 
> >prefixes for paths (tlbfs://,
> >file://).. Or the other way to not mix things but split them.
> 
> This is just a reminder to users. Currently Qemu do not stop user to append a 
> regular file
> for its backend memory, particularly, hugetlbfs is not the only way to use 
> HugePage for
> the THP-enabled system.
> 
> We can implement your idea as a independent patchset in the future.

Isn't the whole point of this patch to make the code not specific to
hugetlbfs anymore? It makes the warning obsolete.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 07/33] util: introduce qemu_file_get_page_size()

2015-10-30 Thread Eduardo Habkost
On Fri, Oct 30, 2015 at 01:56:01PM +0800, Xiao Guangrong wrote:
> There are three places use the some logic to get the page size on
> the file path or file fd
> 
> This patch introduces qemu_file_get_page_size() to unify the code
> 
> Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
[...]
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 914cef5..ad94c5a 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -360,6 +360,22 @@ static size_t fd_getpagesize(int fd)
>  return getpagesize();
>  }
>  
> +size_t qemu_file_get_page_size(const char *path)
> +{
> +size_t size = 0;
> +int fd = qemu_open(path, O_RDONLY);
> +
> +if (fd < 0) {
> +fprintf(stderr, "Could not open %s.\n", path);
> +goto exit;

Have you considered using a Error** argument here?

> +}
> +
> +size = fd_getpagesize(fd);
> +qemu_close(fd);
> +exit:
> +return size;
> +}
> +
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index ac70f08..c661f1c 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -308,28 +308,13 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct 
> kvm_ppc_smmu_info *info)
>  
>  static long gethugepagesize(const char *mem_path)
>  {
> -struct statfs fs;
> -int ret;
> -
> -do {
> -ret = statfs(mem_path, &fs);
> -} while (ret != 0 && errno == EINTR);
> +long size = qemu_file_get_page_size(mem_path);
>  
> -if (ret != 0) {
> -fprintf(stderr, "Couldn't statfs() memory path: %s\n",
> -strerror(errno));
> +if (!size) {
>  exit(1);
>  }
>  
> -#define HUGETLBFS_MAGIC   0x958458f6
> -
> -if (fs.f_type != HUGETLBFS_MAGIC) {
> -/* Explicit mempath, but it's ordinary pages */
> -return getpagesize();
> -}
> -
> -/* It's hugepage, return the huge page size */
> -return fs.f_bsize;
> +return size;
>  }

Why are you changing target-ppc/kvm.c:gethugepagesize() to use the new
function, but not the copy at exec.c? To make it simpler, we could
eliminate both gethugepagesize() functions completely and replace them
with qemu_file_get_page_size() calls (maybe as part of this patch, maybe
in a separate patch, I'm not sure).

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v6 11/33] hostmem-file: use whole file size if possible

2015-10-30 Thread Eduardo Habkost
On Fri, Oct 30, 2015 at 01:56:05PM +0800, Xiao Guangrong wrote:
> Use the whole file size if @size is not specified which is useful
> if we want to directly pass a file to guest
> 
> Signed-off-by: Xiao Guangrong <guangrong.x...@linux.intel.com>
> ---
>  backends/hostmem-file.c | 48 
>  1 file changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index 9097a57..e1bc9ff 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -9,6 +9,9 @@
>   * This work is licensed under the terms of the GNU GPL, version 2 or later.
>   * See the COPYING file in the top-level directory.
>   */
> +#include <sys/ioctl.h>
> +#include <linux/fs.h>

This code needs to build on other platforms too. e.g. using mingw32:

  CCbackends/hostmem-file.o
/home/ehabkost/rh/proj/virt/qemu/backends/hostmem-file.c:12:23: fatal error: 
sys/ioctl.h: No such file or directory
 #include <sys/ioctl.h>
   ^
compilation terminated.
/home/ehabkost/rh/proj/virt/qemu/rules.mak:57: recipe for target 
'backends/hostmem-file.o' failed
make: *** [backends/hostmem-file.o] Error 1


> +
>  #include "qemu-common.h"
>  #include "sysemu/hostmem.h"
>  #include "sysemu/sysemu.h"
> @@ -33,20 +36,57 @@ struct HostMemoryBackendFile {
>  char *mem_path;
>  };
>  
> +static uint64_t get_file_size(const char *file)
> +{
> +struct stat stat_buf;
> +uint64_t size = 0;
> +int fd;
> +
> +fd = open(file, O_RDONLY);
> +if (fd < 0) {
> +return 0;
> +}
> +
> +if (stat(file, &stat_buf) < 0) {
> +goto exit;
> +}
> +
> +if ((S_ISBLK(stat_buf.st_mode)) && !ioctl(fd, BLKGETSIZE64, &size)) {
> +goto exit;
> +}
> +
> +size = lseek(fd, 0, SEEK_END);
> +if (size == -1) {
> +size = 0;
> +}
> +exit:
> +close(fd);
> +return size;
> +}

This code seems to duplicate what block/raw-posix.c:raw_getlength() does
(except for the BLKGETSIZE64 part). Have you considered using the same
code for both?

We can probably move all the raw-posix.c raw_getlength(BlockDriverState
*bs) code to fd_getlength(int fd) functions (on osdep.c?), and just
implement raw-posix.c:raw_getlength(s) as fd_getlength(s->fd).

> +
>  static void
>  file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>  {
>  HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend);
>  
> -if (!backend->size) {
> -error_setg(errp, "can't create backend with size 0");
> -return;
> -}
>  if (!fb->mem_path) {
>  error_setg(errp, "mem-path property not set");
>  return;
>  }
>  
> +if (!backend->size) {
> +/*
> + * use the whole file size if @size is not specified.
> + */
> +backend->size = get_file_size(fb->mem_path);
> +}
> +
> +if (!backend->size) {
> +error_setg(errp, "failed to get file size for %s, can't create "
> + "backend on it", mem_path);
> +return;
> +}
> +
>  backend->force_prealloc = mem_prealloc;
>  memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>   object_get_canonical_path(OBJECT(backend)),
> -- 
> 1.8.3.1
> 
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] target-i386: save/restore vcpu's TSC rate during migration

2015-10-26 Thread Eduardo Habkost
On Mon, Oct 26, 2015 at 10:09:13AM +0800, haozhong.zh...@intel.com wrote:
> On Fri, Oct 23, 2015 at 12:45:13PM -0200, Eduardo Habkost wrote:
> > On Fri, Oct 23, 2015 at 10:27:27AM +0800, Haozhong Zhang wrote:
> > > On Thu, Oct 22, 2015 at 04:45:21PM -0200, Eduardo Habkost wrote:
> > > > On Tue, Oct 20, 2015 at 03:22:51PM +0800, Haozhong Zhang wrote:
> > > > > This patchset enables QEMU to save/restore vcpu's TSC rate during the
> > > > > migration. When cooperating with KVM which supports TSC scaling, guest
> > > > > programs can observe a consistent guest TSC rate even though they are
> > > > > migrated among machines with different host TSC rates.
> > > > > 
> > > > > A pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' are added to
> > > > > control the migration of vcpu's TSC rate.
> > > > 
> > > > The requirements and goals aren't clear to me. I see two possible use
> > > > cases, here:
> > > > 
> > > > 1) Best effort to keep TSC frequency constant if possible (but not
> > > >aborting migration if not possible). This would be an interesting
> > > >default, but a bit unpredictable.
> > > > 2) Strictly ensuring TSC frequency stays constant on migration (and
> > > >aborting migration if not possible). This would be an useful feature,
> > > >but can't be enabled by default unless both hosts have the same TSC
> > > >frequency or support TSC scaling.
> > > > 
> > > > Which one(s) you are trying to implement?
> > > >
> > > 
> > > The former. I agree that it's unpredictable if setting vcpu's TSC
> > > frequency to the migrated value is enabled by default (but not in this
> > > patchset). The cpu option 'load-tsc-freq' is introduced to allow users
> > > to enable this behavior if they do know the underlying KVM and CPU
> > > support TSC scaling. In this way, I think the behavior is predictable
> > > as users do know what they are doing.
> > 
> > I'm confused. If load-tsc-freq doesn't abort when TSC scaling isn't
> > available (use case #1), why isn't it enabled by default? On the other
> > hand, if you expect the user to enable it only if the host supports TSC
> > scaling, why doesn't it abort if TSC scaling isn't available?
> >
> 
> Sorry for the confusion. For user case #1, load-tsc-freq is really not
> needed and the migrated TSC frequency should be set if possible
> (ie. if TSC scaling is supported and KVM_SET_TSC_KHZ succeeds). If
> setting TSC frequency fails, the migration will not be aborted.

Agreed.

> 
> > I mean, we can implement both use cases above this way:
> > 
> > 1) If the user didn't ask for anything explicitly:
> >   * If the tsc-freq value is available in the migration stream, try to
> > set it (but don't abort if it can't be set). (use case #1 above)
> > * Rationale: it won't hurt to try to make the VM behave nicely if
> >   possible, without blocking migration if TSC scaling isn't
> >   available.
> > 2) If the user asked for the TSC frequency to be enforced, set it and
> >   abort if it couldn't be set (use case #2 above). This could apply to
> >   both cases:
> >   2.1) If tsc-freq is explicitly set in the command-line.
> > * Rationale: if the user asked for a specific frequency, we
> >   should do what was requested and not ignore errors silently.
> >   2.2) If tsc-freq is available in the migration stream, and the
> > user asked explicitly for it to be enforced.
> > * Rationale: the user is telling us that the incoming tsc-freq
> >   is important, so we shouldn't ignore it silently.
> > * Open question: how should we name the new option?
> >   "load-tsc-freq" would be misleading because it won't be just about
> >   _loading_ tsc-freq (we would be loading it on use case #1, too),
> >   but about making sure it is enforced. "strict-tsc-freq"?
> >   "enforce-tsc-freq"?
> > 
> > We don't need to implement both #1 and #2 at the same time. But if you
> > just want to implement #1 first, I don't see the need for the
> > "load-tsc-freq" option.
> > 
> > On the migration source, we need another option or internal machine flag
> > for #1. I am not sure it should be an user-visible option. If
> > user-visible, I don't know how to name it. "save-tsc-freq" describes it
> > correctly, but it doesn't make its purpose very clear. Any suggestions?
> > It can also 

Re: [PATCH v2 0/3] target-i386: save/restore vcpu's TSC rate during migration

2015-10-23 Thread Eduardo Habkost
On Fri, Oct 23, 2015 at 08:35:20AM -0200, Marcelo Tosatti wrote:
> On Thu, Oct 22, 2015 at 04:45:21PM -0200, Eduardo Habkost wrote:
> > On Tue, Oct 20, 2015 at 03:22:51PM +0800, Haozhong Zhang wrote:
> > > This patchset enables QEMU to save/restore vcpu's TSC rate during the
> > > migration. When cooperating with KVM which supports TSC scaling, guest
> > > programs can observe a consistent guest TSC rate even though they are
> > > migrated among machines with different host TSC rates.
> > > 
> > > A pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' are added to
> > > control the migration of vcpu's TSC rate.
> > 
> > The requirements and goals aren't clear to me. I see two possible use
> > cases, here:
> > 
> > 1) Best effort to keep TSC frequency constant if possible (but not
> >aborting migration if not possible). This would be an interesting
> >default, but a bit unpredictable.
> > 2) Strictly ensuring TSC frequency stays constant on migration (and
> >aborting migration if not possible). This would be an useful feature,
> >but can't be enabled by default unless both hosts have the same TSC
> >frequency or support TSC scaling.
> 
> Only destination needs to support TSC scaling, to match the frequency
> of the incoming host.

True.

> 
> The KVM code for this feature has submitted or integrated? 
> 
> > Which one(s) you are trying to implement?
> > 
> > In other words, what is the right behavior when KVM_SET_TSC_KHZ fails or
> > KVM_CAP_TSC_CONTROL is not available? We can't answer that question if
> > the requirements and goals are not clear.
> > 
> > Once we know what exactly is the goal, we could enable the new mode with
> > a single option, instead of raw options to control migration stream
> > loading/saving.
> 
> Windows and Linux guests have paravirt clocks and/or options to
> disable direct TSC usage for timekeeping purposes. So disabling
> migration seems overkill.

I assume that users who set TSC frequency explicitly in the VM config
care about it (otherwise they wouldn't be setting it explicitly) and
don't want it to change after migration.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] target-i386: save/restore vcpu's TSC rate during migration

2015-10-23 Thread Eduardo Habkost
On Fri, Oct 23, 2015 at 10:27:27AM +0800, Haozhong Zhang wrote:
> On Thu, Oct 22, 2015 at 04:45:21PM -0200, Eduardo Habkost wrote:
> > On Tue, Oct 20, 2015 at 03:22:51PM +0800, Haozhong Zhang wrote:
> > > This patchset enables QEMU to save/restore vcpu's TSC rate during the
> > > migration. When cooperating with KVM which supports TSC scaling, guest
> > > programs can observe a consistent guest TSC rate even though they are
> > > migrated among machines with different host TSC rates.
> > > 
> > > A pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' are added to
> > > control the migration of vcpu's TSC rate.
> > 
> > The requirements and goals aren't clear to me. I see two possible use
> > cases, here:
> > 
> > 1) Best effort to keep TSC frequency constant if possible (but not
> >aborting migration if not possible). This would be an interesting
> >default, but a bit unpredictable.
> > 2) Strictly ensuring TSC frequency stays constant on migration (and
> >aborting migration if not possible). This would be an useful feature,
> >but can't be enabled by default unless both hosts have the same TSC
> >frequency or support TSC scaling.
> > 
> > Which one(s) you are trying to implement?
> >
> 
> The former. I agree that it's unpredictable if setting vcpu's TSC
> frequency to the migrated value is enabled by default (but not in this
> patchset). The cpu option 'load-tsc-freq' is introduced to allow users
> to enable this behavior if they do know the underlying KVM and CPU
> support TSC scaling. In this way, I think the behavior is predictable
> as users do know what they are doing.

I'm confused. If load-tsc-freq doesn't abort when TSC scaling isn't
available (use case #1), why isn't it enabled by default? On the other
hand, if you expect the user to enable it only if the host supports TSC
scaling, why doesn't it abort if TSC scaling isn't available?

I mean, we can implement both use cases above this way:

1) If the user didn't ask for anything explicitly:
  * If the tsc-freq value is available in the migration stream, try to
set it (but don't abort if it can't be set). (use case #1 above)
* Rationale: it won't hurt to try to make the VM behave nicely if
  possible, without blocking migration if TSC scaling isn't
  available.
2) If the user asked for the TSC frequency to be enforced, set it and
  abort if it couldn't be set (use case #2 above). This could apply to
  both cases:
  2.1) If tsc-freq is explicitly set in the command-line.
* Rationale: if the user asked for a specific frequency, we
  should do what was requested and not ignore errors silently.
  2.2) If tsc-freq is available in the migration stream, and the
user asked explicitly for it to be enforced.
* Rationale: the user is telling us that the incoming tsc-freq
  is important, so we shouldn't ignore it silently.
* Open question: how should we name the new option?
  "load-tsc-freq" would be misleading because it won't be just about
  _loading_ tsc-freq (we would be loading it on use case #1, too),
  but about making sure it is enforced. "strict-tsc-freq"?
  "enforce-tsc-freq"?

We don't need to implement both #1 and #2 at the same time. But if you
just want to implement #1 first, I don't see the need for the
"load-tsc-freq" option.

On the migration source, we need another option or internal machine flag
for #1. I am not sure it should be a user-visible option. If
user-visible, I don't know how to name it. "save-tsc-freq" describes it
correctly, but it doesn't make its purpose very clear. Any suggestions?
It can also be implemented first as an internal machine class flag (set
in pc >= 2.5 only), and possibly become a user-visible option later.

> 
> > In other words, what is the right behavior when KVM_SET_TSC_KHZ fails or
> > KVM_CAP_TSC_CONTROL is not available? We can't answer that question if
> > the requirements and goals are not clear.
> >
> 
> If KVM_CAP_TSC_CONTROL is unavailable, QEMU and KVM will use the host
> TSC frequency as vcpu's TSC frequency.
> 
> If KVM_CAP_TSC_CONTROL is available and KVM_SET_TSC_KHZ fails, the
> setting of TSC frequency will fail and abort either the VM creation
> (this is the case for cpu option 'tsc-freq') or the migration.

I don't see why the lack of KVM_CAP_TSC_CONTROL and failure of
KVM_SET_TSC_KHZ should be treated differently. In both cases it means
the TSC frequency can't be set.

I mean: if KVM_SET_TSC_KHZ is important enough for the user to make QEMU
abort, it should abort if KVM_CAP_TSC_CONTROL isn't even available. On
the other hand, if the user doesn't care about the lack of
KVM_CAP_TSC_CONTROL (meaning it isn't possible to call KVM_SE

Re: [PATCH v2 3/3] target-i386: load the migrated vcpu's TSC rate

2015-10-23 Thread Eduardo Habkost
On Fri, Oct 23, 2015 at 11:14:48AM +0800, Haozhong Zhang wrote:
> On Thu, Oct 22, 2015 at 04:11:37PM -0200, Eduardo Habkost wrote:
> > On Tue, Oct 20, 2015 at 03:22:54PM +0800, Haozhong Zhang wrote:
> > > Set vcpu's TSC rate to the migrated value (if any). If KVM supports TSC
> > > scaling, guest programs will observe TSC increasing in the migrated rate
> > > other than the host TSC rate.
> > > 
> > > The loading is controlled by a new cpu option 'load-tsc-freq'. If it is
> > > present, then the loading will be enabled and the migrated vcpu's TSC
> > > rate will override the value specified by the cpu option
> > > 'tsc-freq'. Otherwise, the loading will be disabled.
> > 
> > Why do we need an option? Why can't we enable loading unconditionally?
> >
> 
> If TSC scaling is not supported by KVM and CPU, unconditionally
> enabling this loading will not take effect which would be different
> from users' expectation. 'load-tsc-freq' is introduced to allow users
> to enable the loading of migrated TSC frequency if they do know the
> underlying KVM and CPU have TSC scaling support.
> 

I don't get your argument about user expectations. We can't read the
user's mind, but let's enumerate all possible scenarios:

* Host has TSC scaling, user expects TSC frequency to be set:
  * We set it. The user is happy.
* Host has TSC scaling, user doesn't expect TSC frequency to be
  set:
  * We still set it. VM behaves better, guest doesn't see changing TSC
frequency. User didn't expect it but won't be unhappy.
* No TSC scaling, user expects TSC frequency to be set:
  * We won't set it, user will be unhappy. But I believe we all agree
we shouldn't make QEMU abort migration by default on all hosts that
don't support TSC scaling.
* No TSC scaling, user doesn't expect TSC frequency to be set:
  * We don't set it. User is happy.

Could you clarify which of the items above you disagree with, exactly?

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/3] target-i386: save/restore vcpu's TSC rate during migration

2015-10-22 Thread Eduardo Habkost
On Tue, Oct 20, 2015 at 03:22:51PM +0800, Haozhong Zhang wrote:
> This patchset enables QEMU to save/restore vcpu's TSC rate during the
> migration. When cooperating with KVM which supports TSC scaling, guest
> programs can observe a consistent guest TSC rate even though they are
> migrated among machines with different host TSC rates.
> 
> A pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' are added to
> control the migration of vcpu's TSC rate.

The requirements and goals aren't clear to me. I see two possible use
cases, here:

1) Best effort to keep TSC frequency constant if possible (but not
   aborting migration if not possible). This would be an interesting
   default, but a bit unpredictable.
2) Strictly ensuring TSC frequency stays constant on migration (and
   aborting migration if not possible). This would be a useful feature,
   but can't be enabled by default unless both hosts have the same TSC
   frequency or support TSC scaling.

Which one(s) are you trying to implement?

In other words, what is the right behavior when KVM_SET_TSC_KHZ fails or
KVM_CAP_TSC_CONTROL is not available? We can't answer that question if
the requirements and goals are not clear.

Once we know what exactly is the goal, we could enable the new mode with
a single option, instead of raw options to control migration stream
loading/saving.


>  * By default, the migration of vcpu's TSC rate is enabled only on
>pc-*-2.5 and newer machine types. If the cpu option 'save-tsc-freq'
>is present, the vcpu's TSC rate will be migrated from older machine
>types as well.
>  * Another cpu option 'load-tsc-freq' controls whether the migrated
>vcpu's TSC rate is used. By default, QEMU will not use the migrated
>TSC rate if this option is not present. Otherwise, QEMU will use
>the migrated TSC rate and override the TSC rate given by the cpu
>option 'tsc-freq'.
> 
> Changes in v2:
>  * Add a pair of cpu options 'save-tsc-freq' and 'load-tsc-freq' to
>control the migration of vcpu's TSC rate.
>  * Move all logic of setting TSC rate to target-i386.
>  * Remove the duplicated TSC setup in kvm_arch_init_vcpu().
> 
> Haozhong Zhang (3):
>   target-i386: add a subsection for migrating vcpu's TSC rate
>   target-i386: calculate vcpu's TSC rate to be migrated
>   target-i386: load the migrated vcpu's TSC rate
> 
>  include/hw/i386/pc.h  |  5 +
>  target-i386/cpu.c |  2 ++
>  target-i386/cpu.h |  3 +++
>  target-i386/kvm.c | 61 
> +++
>  target-i386/machine.c | 19 
>  5 files changed, 81 insertions(+), 9 deletions(-)
> 
> -- 
> 2.4.8
> 

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/3] target-i386: load the migrated vcpu's TSC rate

2015-10-22 Thread Eduardo Habkost
On Tue, Oct 20, 2015 at 03:22:54PM +0800, Haozhong Zhang wrote:
> Set vcpu's TSC rate to the migrated value (if any). If KVM supports TSC
> scaling, guest programs will observe TSC increasing in the migrated rate
> other than the host TSC rate.
> 
> The loading is controlled by a new cpu option 'load-tsc-freq'. If it is
> present, then the loading will be enabled and the migrated vcpu's TSC
> rate will override the value specified by the cpu option
> 'tsc-freq'. Otherwise, the loading will be disabled.

Why do we need an option? Why can't we enable loading unconditionally?

> 
> The setting of vcpu's TSC rate in this patch duplicates the code in
> kvm_arch_init_vcpu(), so we remove the latter one.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  target-i386/cpu.c |  1 +
>  target-i386/cpu.h |  1 +
>  target-i386/kvm.c | 28 +++-
>  3 files changed, 21 insertions(+), 9 deletions(-)
> 
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index b6bb457..763ba4b 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -3144,6 +3144,7 @@ static Property x86_cpu_properties[] = {
>  DEFINE_PROP_BOOL("enforce", X86CPU, enforce_cpuid, false),
>  DEFINE_PROP_BOOL("kvm", X86CPU, expose_kvm, true),
>  DEFINE_PROP_BOOL("save-tsc-freq", X86CPU, env.save_tsc_khz, true),
> +DEFINE_PROP_BOOL("load-tsc-freq", X86CPU, env.load_tsc_khz, false),
>  DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, 0),
>  DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, 0),
>  DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, 0),
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index ba1a289..353f5fb 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -968,6 +968,7 @@ typedef struct CPUX86State {
>  int64_t tsc_khz;
>  int64_t tsc_khz_incoming;
>  bool save_tsc_khz;
> +bool load_tsc_khz;
>  void *kvm_xsave_buf;
>  
>  uint64_t mcg_cap;
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 698524a..34616f5 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -743,15 +743,6 @@ int kvm_arch_init_vcpu(CPUState *cs)
>  return r;
>  }
>  
> -r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
> -if (r && env->tsc_khz) {
> -r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
> -if (r < 0) {
> -fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> -return r;
> -}
> -}
> -
>  if (kvm_has_xsave()) {
>  env->kvm_xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
>  }
> @@ -2223,6 +2214,25 @@ static int kvm_setup_tsc_khz(X86CPU *cpu, int level)
>  return 0;
>  
>  /*
> + * If the cpu option 'load-tsc-freq' is present, the vcpu's TSC rate in 
> the
> + * migrated state will be used and the overrides the user-specified 
> vcpu's
> + * TSC rate (if any).
> + */
> +if (runstate_check(RUN_STATE_INMIGRATE) &&
> +env->load_tsc_khz && env->tsc_khz_incoming) {
> +env->tsc_khz = env->tsc_khz_incoming;
> +}

Please don't make the results of the function depend on global QEMU
runstate, as it makes it harder to reason about it, and easy to
introduce subtle bugs if we change initialization order. Can't we just
ensure tsc_khz gets set to the right value before the function is
called, inside the code that loads migration data?

> +
> +r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL);
> +if (r && env->tsc_khz) {
> +r = kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz);
> +if (r < 0) {
> +fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> +return r;
> +}
> +}

So, the final result here depends not only on the configuration, but also
on host capabilities. That means nobody can possibly know if the
tsc-freq option really works, until they enable it, run the VM, and
check the results from inside the VM. Not a good idea.

(This doesn't apply just to the new code, the existing code is already
broken this way.)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 27/32] nvdimm: support DSM_CMD_IMPLEMENTED function

2015-10-14 Thread Eduardo Habkost
On Wed, Oct 14, 2015 at 10:50:40PM +0800, Xiao Guangrong wrote:
> On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
> >On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
> >>  static void dsm_write(void *opaque, hwaddr addr,
> >>uint64_t val, unsigned size)
> >>  {
> >>+NVDIMMState *state = opaque;
> >>+MemoryRegion *dsm_ram_mr;
> >>+dsm_in *in;
> >>+dsm_out *out;
> >>+uint32_t revision, function, handle;
> >>+
> >>  if (val != NOTIFY_VALUE) {
> >>  fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
> >>  }
> >>+
> >>+dsm_ram_mr = memory_region_find(>mr, state->page_size,
> >>+state->page_size).mr;
> >>+memory_region_unref(dsm_ram_mr);
> >>+in = memory_region_get_ram_ptr(dsm_ram_mr);
> >
> >This looks suspicious.  Shouldn't the memory_region_unref(dsm_ram_mr)
> >happen after we're done using it?
> 
> This region is keep-alive during QEMU's running, it is okay.  The same
> style is applied to other codes, for example: line 208 in
> hw/s390x/sclp.c.

In sclp.c (assign_storage()), the memory region is never used after
memory_region_unref() is called. In unassign_storage(), sclp.c owns an
additional reference, grabbed by assign_storage().

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] kvm-all: notice KVM of vcpu's TSC rate after migration

2015-09-30 Thread Eduardo Habkost
On Wed, Sep 30, 2015 at 08:32:26AM +0800, Haozhong Zhang wrote:
> > [...]
> > > > Or maybe we shouldn't treat this as VM state, but as configuration, and
> > > > let management configure the TSC frequency explicitly if the user really
> > > > needs it to stay the same during migration.
> > > >
> > > > (CCing libvir-list to see if they have feedback)
> > > >
> > > 
> > > Thanks for CC. I'll consider to add a command line option to control
> > > the configuration of guest TSC fequency.
> > 
> > It already exists, -cpu has a "tsc-freq" option.
> >
> 
> What I'm considering is to add a "-keep-tsc-freq" option to control
> whether the TSC frequency should be migrated if "tsc-freq" is not
> presented. Compared to "tsc-freq", "-keep-tsc-freq" can free users
> from figuring out the guest TSC frequency manually in the migration.

If you do that, please make it a property of the CPU object, so it
can be set as a "-cpu" option.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] kvm-all: notice KVM of vcpu's TSC rate after migration

2015-09-29 Thread Eduardo Habkost
On Tue, Sep 29, 2015 at 11:43:34AM +0800, Haozhong Zhang wrote:
> On Mon, Sep 28, 2015 at 01:37:34PM -0300, Eduardo Habkost wrote:
> > On Mon, Sep 28, 2015 at 01:38:31PM +0800, Haozhong Zhang wrote:
[...]
> > >  static void do_kvm_cpu_synchronize_post_init(void *arg)
> > >  {
> > >  CPUState *cpu = arg;
> > > +CPUX86State *env = _CPU(cpu)->env;
> > > +int r;
> > > +
> > > +/*
> > > + * XXX: KVM_SET_TSC_KHZ must be done before kvm_arch_put_registers().
> > 
> > Could you explain where this requirement comes from?
> >
> 
> kvm_arch_put_registers() below will setup vcpu's MSR_IA32_TSC through
> KVM ioctl KVM_SET_MSRS. KVM needs to know vcpu's TSC rate so as to
> correctly scale the TSC value given by QEMU, especially when vcpu's
> TSC rate is different than the host TSC rate (e.g. it's migrated from
> another machine w/ different host TSC rate than the current one).

Thanks. The comment above could contain a short version of this
explanation, e.g.: "KVM needs KVM_SET_TSC_KHZ to be done before
KVM_SET_MSRS sets MSR_IA32_TSC (done by kvm_arch_put_registers())".

> 
[...]
> > Or maybe we shouldn't treat this as VM state, but as configuration, and
> > let management configure the TSC frequency explicitly if the user really
> > needs it to stay the same during migration.
> >
> > (CCing libvir-list to see if they have feedback)
> >
> 
> Thanks for CC. I'll consider to add a command line option to control
> the configuration of guest TSC fequency.

It already exists, -cpu has a "tsc-freq" option.

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] kvm-all: notice KVM of vcpu's TSC rate after migration

2015-09-28 Thread Eduardo Habkost
On Mon, Sep 28, 2015 at 01:38:31PM +0800, Haozhong Zhang wrote:
> When a vcpu is created in KVM, its TSC rate is initially identical to
> the host TSC rate. If its state is migrated to a vcpu on another
> machine (target machine) which may uses a different host TSC rate, QEMU
> on the target machine should notice KVM of the migrated vcpu's TSC
> rate. In case that KVM on the target machine supports TSC scaling, guest
> programs running on the migrated vcpu will observe the same TSC rate
> before and after the migration.
> 
> Signed-off-by: Haozhong Zhang 
> ---
>  kvm-all.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/kvm-all.c b/kvm-all.c
> index 0be4615..e8de038 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1769,6 +1769,19 @@ void kvm_cpu_synchronize_post_reset(CPUState *cpu)
>  static void do_kvm_cpu_synchronize_post_init(void *arg)
>  {
>  CPUState *cpu = arg;
> +CPUX86State *env = _CPU(cpu)->env;
> +int r;
> +
> +/*
> + * XXX: KVM_SET_TSC_KHZ must be done before kvm_arch_put_registers().

Could you explain where this requirement comes from?

> + */
> +r = kvm_check_extension(cpu->kvm_state, KVM_CAP_TSC_CONTROL);
> +if (r && env->tsc_khz) {
> +r = kvm_vcpu_ioctl(cpu, KVM_SET_TSC_KHZ, env->tsc_khz);
> +if (r < 0) {
> +fprintf(stderr, "KVM_SET_TSC_KHZ failed\n");
> +}
> +}

This is duplicating the existing KVM_SET_TSC_KHZ call at
kvm_arch_init_vcpu(). I wonder if there's a way to avoid this
duplication. Should we set TSC KHz only at
do_kvm_cpu_synchronize_post_init(), and remove the call from
kvm_arch_init_vcpu()?

Or maybe we shouldn't treat this as VM state, but as configuration, and
let management configure the TSC frequency explicitly if the user really
needs it to stay the same during migration.

(CCing libvir-list to see if they have feedback)

-- 
Eduardo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   3   4   5   6   7   >