On Fri, Jun 18, 2021 at 04:48:14AM +0000, Jing Zhang wrote:
> This commit defines the API for userspace and prepares the common
> functionality to support per VM/VCPU binary stats data readings.
>
> KVM stats are currently accessible only through debugfs, which has some
> shortcomings that this patch series is intended to fix:
> 1. The current debugfs stats solution in KVM could be disabled
> when kernel Lockdown mode is enabled, which is a potential
> risk for production.
> 2. The current debugfs stats solution in KVM is organized as "one
> stats per file", it is good for debugging, but not efficient
> for production.
> 3. The stats read/clear in current debugfs solution in KVM are
> protected by the global kvm_lock.
>
> Besides that, there are some other benefits with this change:
> 1. All KVM VM/VCPU stats can be read out in bulk by one copy
> to userspace.
> 2. A schema is used to describe KVM statistics. From userspace's
> perspective, the KVM statistics are self-describing.
> 3. With the fd-based solution, a separate telemetry would be able
> to read KVM stats in a less privileged environment.
> 4. After the initial setup by reading in stats descriptors, a
> telemetry only needs to read the stats data itself, no more
> parsing or setup is needed.
>
> Reviewed-by: David Matlack <[email protected]>
> Reviewed-by: Ricardo Koller <[email protected]>
> Reviewed-by: Krish Sadhukhan <[email protected]>
> Reviewed-by: Fuad Tabba <[email protected]>
> Tested-by: Fuad Tabba <[email protected]> #arm64
> Signed-off-by: Jing Zhang <[email protected]>
> ---
> arch/arm64/kvm/Makefile | 2 +-
> arch/mips/kvm/Makefile | 2 +-
> arch/powerpc/kvm/Makefile | 2 +-
> arch/s390/kvm/Makefile | 3 +-
> arch/x86/kvm/Makefile | 2 +-
> include/linux/kvm_host.h | 145 ++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/kvm.h | 42 +++++++++++
> virt/kvm/binary_stats.c | 130 ++++++++++++++++++++++++++++++++++
> 8 files changed, 323 insertions(+), 5 deletions(-)
> create mode 100644 virt/kvm/binary_stats.c
>
> diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
> index 589921392cb1..989bb5dad2c8 100644
> --- a/arch/arm64/kvm/Makefile
> +++ b/arch/arm64/kvm/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
> obj-$(CONFIG_KVM) += hyp/
>
> kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
> - $(KVM)/vfio.o $(KVM)/irqchip.o \
> + $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
> arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
> inject_fault.o va_layout.o handle_exit.o \
> guest.o debug.o reset.o sys_regs.o \
> diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
> index 30cc060857c7..c67250a956b8 100644
> --- a/arch/mips/kvm/Makefile
> +++ b/arch/mips/kvm/Makefile
> @@ -2,7 +2,7 @@
> # Makefile for KVM support for MIPS
> #
>
> -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o
> eventfd.o)
> +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o
> eventfd.o binary_stats.o)
>
> EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
>
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 2bfeaa13befb..b347d043b932 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -6,7 +6,7 @@
> ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
> KVM := ../../../virt/kvm
>
> -common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
> +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
> common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
> common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
>
> diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
> index 12decca22e7c..b3aaadc60ead 100644
> --- a/arch/s390/kvm/Makefile
> +++ b/arch/s390/kvm/Makefile
> @@ -4,7 +4,8 @@
> # Copyright IBM Corp. 2008
>
> KVM := ../../../virt/kvm
> -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o
> $(KVM)/irqchip.o $(KVM)/vfio.o
> +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o \
> + $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
>
> ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
>
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index 83331376b779..75dfd27b6e8a 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -11,7 +11,7 @@ KVM := ../../../virt/kvm
>
> kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
> $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
> \
> - $(KVM)/dirty_ring.o
> + $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
> kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
>
> kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 5a31e0696360..2f0d12064ae7 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1272,6 +1272,12 @@ struct kvm_stats_debugfs_item {
> int mode;
> };
>
> +#define KVM_STATS_NAME_LEN 48
> +struct _kvm_stats_desc {
> + struct kvm_stats_desc desc;
> + char name[KVM_STATS_NAME_LEN];
> +};
> +
> #define KVM_DBGFS_GET_MODE(dbgfs_item)
> \
> ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
>
> @@ -1285,8 +1291,147 @@ struct kvm_stats_debugfs_item {
> { n, offsetof(struct kvm_vcpu, stat.generic.x), \
> KVM_STAT_VCPU, ## __VA_ARGS__ }
>
> +#define STATS_DESC_COMMON(type, unit, base, exp) \
> + .flags = type | unit | base | \
> + BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \
> + BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \
> + BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \
> + .exponent = exp, \
> + .size = 1
> +
> +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \
> + { \
> + { \
> + STATS_DESC_COMMON(type, unit, base, exp), \
> + .offset = offsetof(struct kvm_vm_stat, generic.stat) \
> + }, \
> + .name = #stat, \
> + }
> +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \
> + { \
> + { \
> + STATS_DESC_COMMON(type, unit, base, exp), \
> + .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
> + }, \
> + .name = #stat, \
> + }
> +#define VM_STATS_DESC(stat, type, unit, base, exp) \
> + { \
> + { \
> + STATS_DESC_COMMON(type, unit, base, exp), \
> + .offset = offsetof(struct kvm_vm_stat, stat) \
> + }, \
> + .name = #stat, \
> + }
> +#define VCPU_STATS_DESC(stat, type, unit, base, exp) \
> + { \
> + { \
> + STATS_DESC_COMMON(type, unit, base, exp), \
> + .offset = offsetof(struct kvm_vcpu_stat, stat) \
> + }, \
> + .name = #stat, \
> + }
> +/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */
> +#define STATS_DESC(SCOPE, stat, type, unit, base, exp)
> \
> + SCOPE##_STATS_DESC(stat, type, unit, base, exp)
> +
> +#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \
> + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \
> + unit, base, exponent)
> +#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)
> \
> + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) \
> +
> +/* Cumulative counter */
> +#define STATS_DESC_COUNTER(SCOPE, name)
> \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE, \
> + KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous counter */
> +#define STATS_DESC_ICOUNTER(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE, \
> + KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative clock cycles */
> +#define STATS_DESC_CYCLE(SCOPE, name)
> \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_CYCLES, \
> + KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous clock cycles */
> +#define STATS_DESC_ICYCLE(SCOPE, name)
> \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_CYCLES, \
> + KVM_STATS_BASE_POW10, 0)
> +
> +/* Cumulative memory size in Byte */
> +#define STATS_DESC_SIZE_BYTE(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 0)
> +/* Cumulative memory size in KiByte */
> +#define STATS_DESC_SIZE_KBYTE(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 10)
> +/* Cumulative memory size in MiByte */
> +#define STATS_DESC_SIZE_MBYTE(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 20)
> +/* Cumulative memory size in GiByte */
> +#define STATS_DESC_SIZE_GBYTE(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 30)
> +
> +/* Instantaneous memory size in Byte */
> +#define STATS_DESC_ISIZE_BYTE(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 0)
> +/* Instantaneous memory size in KiByte */
> +#define STATS_DESC_ISIZE_KBYTE(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 10)
> +/* Instantaneous memory size in MiByte */
> +#define STATS_DESC_ISIZE_MBYTE(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 20)
> +/* Instantaneous memory size in GiByte */
> +#define STATS_DESC_ISIZE_GBYTE(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BYTES, \
> + KVM_STATS_BASE_POW2, 30)
> +
> +/* Cumulative time in second */
> +#define STATS_DESC_TIME_SEC(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, 0)
> +/* Cumulative time in millisecond */
> +#define STATS_DESC_TIME_MSEC(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -3)
> +/* Cumulative time in microsecond */
> +#define STATS_DESC_TIME_USEC(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -6)
> +/* Cumulative time in nanosecond */
> +#define STATS_DESC_TIME_NSEC(SCOPE, name) \
> + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -9)
> +
> +/* Instantaneous time in second */
> +#define STATS_DESC_ITIME_SEC(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, 0)
> +/* Instantaneous time in millisecond */
> +#define STATS_DESC_ITIME_MSEC(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -3)
> +/* Instantaneous time in microsecond */
> +#define STATS_DESC_ITIME_USEC(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -6)
> +/* Instantaneous time in nanosecond */
> +#define STATS_DESC_ITIME_NSEC(SCOPE, name) \
> + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_SECONDS, \
> + KVM_STATS_BASE_POW10, -9)
> +
> extern struct kvm_stats_debugfs_item debugfs_entries[];
> extern struct dentry *kvm_debugfs_dir;
> +ssize_t kvm_stats_read(char *id, struct kvm_stats_header *header,
> + struct _kvm_stats_desc *desc, void *stats, size_t size_stats,
> + char __user *user_buffer, size_t size, loff_t *offset);
>
> #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
> static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 9febe1412f7a..ab73e905105c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1086,6 +1086,7 @@ struct kvm_ppc_resize_hpt {
> #define KVM_CAP_HYPERV_ENFORCE_CPUID 199
> #define KVM_CAP_SREGS2 200
> #define KVM_CAP_EXIT_HYPERCALL 201
> +#define KVM_CAP_BINARY_STATS_FD 202
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1905,4 +1906,45 @@ struct kvm_dirty_gfn {
> #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0)
> #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1)
>
> +#define KVM_STATS_ID_MAXLEN 64
> +
> +struct kvm_stats_header {
> + __u32 name_size;
> + __u32 count;
> + __u32 desc_offset;
> + __u32 data_offset;
> + char id[];
> +};
You mentioned before that the size of this really is the size of the
structure + KVM_STATS_ID_MAXLEN, right? Or is it - KVM_STATS_ID_MAXLEN?
If so, why not put that value explicitly in:
char id[THE_REST_OF_THE_HEADER_SPACE];
This is not a variable header size at all, and you cannot change it
going forward, so the variable-length array here feels disingenuous.
> +
> +#define KVM_STATS_TYPE_SHIFT 0
> +#define KVM_STATS_TYPE_MASK (0xF << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT)
> +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_INSTANT
> +
> +#define KVM_STATS_UNIT_SHIFT 4
> +#define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_NONE (0x0 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT)
> +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES
> +
> +#define KVM_STATS_BASE_SHIFT 8
> +#define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT)
> +#define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2
> +
> +struct kvm_stats_desc {
> + __u32 flags;
> + __s16 exponent;
> + __u16 size;
> + __u32 offset;
> + __u32 unused;
> + char name[];
> +};
What is the max length of name?
Why aren't these structures defined here in kerneldoc so that we can
understand them better? Putting them in a .rst file guarantees they
will get out of sync, and you can always directly import the kerneldoc
into the .rst file.
thanks,
greg k-h
_______________________________________________
kvmarm mailing list
[email protected]
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm