[Devel] [PATCH vz7, vz8 1/1] kvm: fix AMD IBRS/IBPB/STIBP/SSBD reporting #PSBM-120787

2020-10-22 Thread Denis V. Lunev
We should report these bits in CPUID 0x80000008 EBX on AMD only, i.e. when
the AMD-specific feature bits are enabled.

Signed-off-by: Denis V. Lunev 
CC: Vasily Averin 
CC: Konstantin Khorenko 
---
 arch/x86/kvm/cpuid.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index fe8b92723990..05898112a306 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -641,13 +641,13 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 * arch/x86/kernel/cpu/bugs.c is kind enough to
 * record that in cpufeatures so use them.
 */
-   if (boot_cpu_has(X86_FEATURE_IBPB))
+   if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
entry->ebx |= F(AMD_IBPB);
-   if (boot_cpu_has(X86_FEATURE_IBRS))
+   if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
entry->ebx |= F(AMD_IBRS);
-   if (boot_cpu_has(X86_FEATURE_STIBP))
+   if (boot_cpu_has(X86_FEATURE_AMD_STIBP))
entry->ebx |= F(AMD_STIBP);
-   if (boot_cpu_has(X86_FEATURE_SSBD))
+   if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(AMD_SSBD);
if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
entry->ebx |= F(AMD_SSB_NO);
-- 
2.17.1
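
For reference, the guest-visible effect of this fix can be checked from
inside an AMD guest with a short CPUID probe. This is an illustrative
sketch, not part of the patch; the bit positions are from AMD's CPUID
Fn8000_0008 EBX definition:

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID leaf 0x80000008: AMD speculation-control feature bits */
            if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("IBPB  : %u\n", (ebx >> 12) & 1);
            printf("IBRS  : %u\n", (ebx >> 14) & 1);
            printf("STIBP : %u\n", (ebx >> 15) & 1);
            printf("SSBD  : %u\n", (ebx >> 24) & 1);
            printf("SSB_NO: %u\n", (ebx >> 26) & 1);
            return 0;
    }

With the fix applied, these bits are set for the guest only when the host
kernel itself detected the corresponding AMD features.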



[Devel] [PATCH vz8 2/4] ia32: add 32-bit vdso virtualization.

2020-10-22 Thread Andrey Ryabinin
Similarly to the 64-bit vdso, make the 32-bit vdso mapping per-ve.
This will allow per-container modification of the Linux version
in the .note section of the vdso, and of the monotonic time.

https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin 
---
 arch/x86/entry/vdso/vma.c|  4 ++--
 arch/x86/kernel/process_64.c |  2 +-
 include/linux/ve.h   |  1 +
 kernel/ve/ve.c   | 35 +--
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index c48deffc1473..538c6730f436 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -56,7 +56,7 @@ static void vdso_fix_landing(const struct vdso_image *image,
struct vm_area_struct *new_vma)
 {
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-   if (in_ia32_syscall() && image == &vdso_image_32) {
+   if (in_ia32_syscall() && image == get_exec_env()->vdso_32) {
struct pt_regs *regs = current_pt_regs();
unsigned long vdso_land = image->sym_int80_landing_pad;
unsigned long old_land_addr = vdso_land +
@@ -281,7 +281,7 @@ static int load_vdso32(void)
if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
return 0;
 
-   return map_vdso(&vdso_image_32, 0);
+   return map_vdso(get_exec_env()->vdso_32, 0);
 }
 #endif
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a010d4b9d126..22215141 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -686,7 +686,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 # endif
 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case ARCH_MAP_VDSO_32:
-   return prctl_map_vdso(&vdso_image_32, arg2);
+   return prctl_map_vdso(get_exec_env()->vdso_32, arg2);
 # endif
case ARCH_MAP_VDSO_64:
return prctl_map_vdso(get_exec_env()->vdso_64, arg2);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 0e85a4032c3a..5b1962ff4c66 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -95,6 +95,7 @@ struct ve_struct {
struct cn_private   *cn;
 #endif
struct vdso_image   *vdso_64;
+   struct vdso_image   *vdso_32;
 };
 
 #define VE_MEMINFO_DEFAULT 1   /* default behaviour */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 186deb3f88f4..03b8d126a0ed 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -58,6 +58,7 @@ struct ve_struct ve0 = {
.netns_max_nr   = INT_MAX,
.meminfo_val= VE_MEMINFO_SYSTEM,
 .vdso_64= (struct vdso_image*)&vdso_image_64,
+   .vdso_32= (struct vdso_image*)&vdso_image_32,
 };
 EXPORT_SYMBOL(ve0);
 
@@ -540,13 +541,12 @@ static __u64 ve_setup_iptables_mask(__u64 init_mask)
 }
 #endif
 
-static int copy_vdso(struct ve_struct *ve)
+static int copy_vdso(struct vdso_image **vdso_dst, const struct vdso_image *vdso_src)
 {
-   const struct vdso_image *vdso_src = &vdso_image_64;
struct vdso_image *vdso;
void *vdso_data;
 
-   if (ve->vdso_64)
+   if (*vdso_dst)
return 0;
 
vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL);
@@ -563,10 +563,22 @@ static int copy_vdso(struct ve_struct *ve)
 
vdso->data = vdso_data;
 
-   ve->vdso_64 = vdso;
+   *vdso_dst = vdso;
return 0;
 }
 
+static void ve_free_vdso(struct ve_struct *ve)
+{
+   if (ve->vdso_64 && ve->vdso_64 != &vdso_image_64) {
+   kfree(ve->vdso_64->data);
+   kfree(ve->vdso_64);
+   }
+   if (ve->vdso_32 && ve->vdso_32 != &vdso_image_32) {
+   kfree(ve->vdso_32->data);
+   kfree(ve->vdso_32);
+   }
+}
+
 static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css)
 {
struct ve_struct *ve = &ve0;
@@ -592,7 +604,10 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
if (err)
goto err_log;
 
-   if (copy_vdso(ve))
+   if (copy_vdso(&ve->vdso_64, &vdso_image_64))
+   goto err_vdso;
+
+   if (copy_vdso(&ve->vdso_32, &vdso_image_32))
goto err_vdso;
 
ve->features = VE_FEATURES_DEF;
@@ -619,6 +634,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
return &ve->css;
 
 err_vdso:
+   ve_free_vdso(ve);
ve_log_destroy(ve);
 err_log:
free_percpu(ve->sched_lat_ve.cur);
@@ -658,15 +674,6 @@ static void ve_offline(struct cgroup_subsys_state *css)
ve->ve_name = NULL;
 }
 
-static void ve_free_vdso(struct ve_struct *ve)
-{
-   if (ve->vdso_64 == _image_64)
-   return;
-
-   kfree(ve->vdso_64->data);
-   kfree(ve->vdso_64);
-}
-
 static void ve_destroy(struct cgroup_subsys_state *css)
 {
struct ve_struct *ve = css_to_ve(css);
-- 
2.26.2


[Devel] [PATCH vz8 3/4] ve: patch linux_version_code in vdso

2020-10-22 Thread Andrey Ryabinin
On a write to the ve.os_release file, patch the linux_version_code
in the .note section of the vdso.

https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin 
---
 arch/x86/entry/vdso/vdso-note.S   | 2 ++
 arch/x86/entry/vdso/vdso2c.c  | 1 +
 arch/x86/entry/vdso/vdso32/note.S | 2 ++
 arch/x86/include/asm/vdso.h   | 1 +
 kernel/ve/ve.c| 7 +++
 5 files changed, 13 insertions(+)

diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
index 79a071e4357e..c0e6e65f9fec 100644
--- a/arch/x86/entry/vdso/vdso-note.S
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+   .globl linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+linux_version_code:
.long LINUX_VERSION_CODE
 ELFNOTE_END
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 4674f58581a1..7fab0bd96ac1 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -109,6 +109,7 @@ struct vdso_sym required_syms[] = {
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
{"int80_landing_pad", true},
+   {"linux_version_code", true},
 };
 
 __attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
index 9fd51f206314..096b62f14863 100644
--- a/arch/x86/entry/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -10,7 +10,9 @@
 /* Ideally this would use UTS_NAME, but using a quoted string here
doesn't work. Remember to change this when changing the
kernel's name. */
+   .globl linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+linux_version_code:
.long LINUX_VERSION_CODE
 ELFNOTE_END
 
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 27566e57e87d..92c7ac06828e 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -27,6 +27,7 @@ struct vdso_image {
long sym___kernel_rt_sigreturn;
long sym___kernel_vsyscall;
long sym_int80_landing_pad;
+   long sym_linux_version_code;
 };
 
 #ifdef CONFIG_X86_64
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 03b8d126a0ed..98c2e7e3d2c6 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -954,6 +954,7 @@ static ssize_t ve_os_release_write(struct kernfs_open_file *of, char *buf,
 {
struct cgroup_subsys_state *css = of_css(of);
struct ve_struct *ve = css_to_ve(css);
+   int n1, n2, n3, new_version;
char *release;
int ret = 0;
 
@@ -964,6 +965,12 @@ static ssize_t ve_os_release_write(struct kernfs_open_file *of, char *buf,
goto up_opsem;
}
 
+   if (sscanf(buf, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+   new_version = ((n1 << 16) + (n2 << 8)) + n3;
+   *((int *)(ve->vdso_64->data + ve->vdso_64->sym_linux_version_code)) = new_version;
+   *((int *)(ve->vdso_32->data + ve->vdso_32->sym_linux_version_code)) = new_version;
+   }
+
down_write(&uts_sem);
release = ve->ve_ns->uts_ns->name.release;
strncpy(release, buf, __NEW_UTS_LEN);
-- 
2.26.2
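
The encoding above is the same one the kernel's KERNEL_VERSION(a, b, c)
macro uses: (a << 16) + (b << 8) + c. A quick stand-alone check of what a
hypothetical write of "4.18.0" to ve.os_release would store:

    #include <stdio.h>

    /* same encoding as KERNEL_VERSION(a, b, c) */
    static int version_code(int a, int b, int c)
    {
            return (a << 16) + (b << 8) + c;
    }

    int main(void)
    {
            /* prints "4.18.0 -> 0x41200" */
            printf("4.18.0 -> %#x\n", version_code(4, 18, 0));
            return 0;
    }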



[Devel] [PATCH vz8 1/4] ve, x86_64: add per-ve vdso mapping.

2020-10-22 Thread Andrey Ryabinin
Make the vdso mapping per-ve. This will allow per-container modification
of the Linux version in the .note section of the vdso, and of the monotonic time.

https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin 
---
 arch/x86/entry/vdso/vma.c|  3 ++-
 arch/x86/kernel/process_64.c |  2 +-
 include/linux/ve.h   |  2 ++
 kernel/ve/ve.c   | 43 
 4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index eb3d85f87884..c48deffc1473 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -291,7 +291,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!vdso64_enabled)
return 0;
 
-   return map_vdso_randomized(&vdso_image_64);
+
+   return map_vdso_randomized(get_exec_env()->vdso_64);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c1c8d66cbe70..a010d4b9d126 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -689,7 +689,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
return prctl_map_vdso(&vdso_image_32, arg2);
 # endif
case ARCH_MAP_VDSO_64:
-   return prctl_map_vdso(&vdso_image_64, arg2);
+   return prctl_map_vdso(get_exec_env()->vdso_64, arg2);
 #endif
 
default:
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ec7dc522ac1f..0e85a4032c3a 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct nsproxy;
 struct veip_struct;
@@ -93,6 +94,7 @@ struct ve_struct {
 #ifdef CONFIG_CONNECTOR
struct cn_private   *cn;
 #endif
+   struct vdso_image   *vdso_64;
 };
 
 #define VE_MEMINFO_DEFAULT 1   /* default behaviour */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index cc26d3b2fa9b..186deb3f88f4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -57,6 +57,7 @@ struct ve_struct ve0 = {
.netns_avail_nr = ATOMIC_INIT(INT_MAX),
.netns_max_nr   = INT_MAX,
.meminfo_val= VE_MEMINFO_SYSTEM,
+   .vdso_64= (struct vdso_image*)&vdso_image_64,
 };
 EXPORT_SYMBOL(ve0);
 
@@ -539,6 +540,33 @@ static __u64 ve_setup_iptables_mask(__u64 init_mask)
 }
 #endif
 
+static int copy_vdso(struct ve_struct *ve)
+{
+   const struct vdso_image *vdso_src = &vdso_image_64;
+   struct vdso_image *vdso;
+   void *vdso_data;
+
+   if (ve->vdso_64)
+   return 0;
+
+   vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL);
+   if (!vdso)
+   return -ENOMEM;
+
+   vdso_data = kmalloc(vdso_src->size, GFP_KERNEL);
+   if (!vdso_data) {
+   kfree(vdso);
+   return -ENOMEM;
+   }
+
+   memcpy(vdso_data, vdso_src->data, vdso_src->size);
+
+   vdso->data = vdso_data;
+
+   ve->vdso_64 = vdso;
+   return 0;
+}
+
 static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css)
 {
struct ve_struct *ve = &ve0;
@@ -564,6 +592,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
if (err)
goto err_log;
 
+   if (copy_vdso(ve))
+   goto err_vdso;
+
ve->features = VE_FEATURES_DEF;
ve->_randomize_va_space = ve0._randomize_va_space;
 
@@ -587,6 +618,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 
return &ve->css;
 
+err_vdso:
+   ve_log_destroy(ve);
 err_log:
free_percpu(ve->sched_lat_ve.cur);
 err_lat:
@@ -625,12 +658,22 @@ static void ve_offline(struct cgroup_subsys_state *css)
ve->ve_name = NULL;
 }
 
+static void ve_free_vdso(struct ve_struct *ve)
+{
+   if (ve->vdso_64 == &vdso_image_64)
+   return;
+
+   kfree(ve->vdso_64->data);
+   kfree(ve->vdso_64);
+}
+
 static void ve_destroy(struct cgroup_subsys_state *css)
 {
struct ve_struct *ve = css_to_ve(css);
 
kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
ve_log_destroy(ve);
+   ve_free_vdso(ve);
 #if IS_ENABLED(CONFIG_BINFMT_MISC)
kfree(ve->binfmt_misc);
 #endif
-- 
2.26.2
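
A process finds its vdso through the auxiliary vector, so what this patch
changes is which image AT_SYSINFO_EHDR ends up pointing to. A minimal way
to look at it from userspace (illustrative sketch, assuming glibc):

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void)
    {
            /* base of the vdso mapped into this process; after this
             * patch each container gets its own private copy */
            unsigned long vdso_base = getauxval(AT_SYSINFO_EHDR);

            printf("vdso mapped at %#lx\n", vdso_base);
            return 0;
    }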



[Devel] [PATCH vz8 4/4] ve: add per-ve CLOCK_MONOTONIC time via __vclock_gettime()

2020-10-22 Thread Andrey Ryabinin
Make it possible to read a virtualized container's CLOCK_MONOTONIC time
via __vclock_gettime(). Record the container's start time in the per-ve
vdso and subtract it from the host's time on clock read.

https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin 
---
 arch/x86/entry/vdso/vclock_gettime.c | 27 +++
 arch/x86/entry/vdso/vdso2c.c |  1 +
 arch/x86/include/asm/vdso.h  |  1 +
 kernel/ve/ve.c   | 14 ++
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index e48ca3afa091..be1de6c4cafa 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -24,6 +24,8 @@
 
#define gtod (&VVAR(vsyscall_gtod_data))
 
+u64 ve_start_time;
+
 extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
 extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
 extern time_t __vdso_time(time_t *t);
@@ -227,6 +229,21 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
return mode;
 }
 
+static inline void timespec_sub_ns(struct timespec *ts, u64 ns)
+{
+   if ((s64)ns <= 0) {
ts->tv_sec += __iter_div_u64_rem(-ns, NSEC_PER_SEC, &ns);
+   ts->tv_nsec = ns;
+   } else {
ts->tv_sec -= __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+   if (ns) {
+   ts->tv_sec--;
+   ns = NSEC_PER_SEC - ns;
+   }
+   ts->tv_nsec = ns;
+   }
+}
+
 notrace static int __always_inline do_monotonic(struct timespec *ts)
 {
unsigned long seq;
@@ -242,9 +259,7 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
ns >>= gtod->shift;
} while (unlikely(gtod_read_retry(gtod, seq)));
 
-   ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-   ts->tv_nsec = ns;
-
+   timespec_sub_ns(ts, ve_start_time - ns);
return mode;
 }
 
@@ -260,12 +275,16 @@ notrace static void do_realtime_coarse(struct timespec *ts)
 
 notrace static void do_monotonic_coarse(struct timespec *ts)
 {
+   u64 ns;
unsigned long seq;
+
do {
seq = gtod_read_begin(gtod);
ts->tv_sec = gtod->monotonic_time_coarse_sec;
-   ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
+   ns = gtod->monotonic_time_coarse_nsec;
} while (unlikely(gtod_read_retry(gtod, seq)));
+
+   timespec_sub_ns(ts, ve_start_time - ns);
 }
 
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 7fab0bd96ac1..c76141e9ca16 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -110,6 +110,7 @@ struct vdso_sym required_syms[] = {
{"__kernel_rt_sigreturn", true},
{"int80_landing_pad", true},
{"linux_version_code", true},
+   {"ve_start_time", true},
 };
 
 __attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 92c7ac06828e..9c265f79a126 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -28,6 +28,7 @@ struct vdso_image {
long sym___kernel_vsyscall;
long sym_int80_landing_pad;
long sym_linux_version_code;
+   long sym_ve_start_time;
 };
 
 #ifdef CONFIG_X86_64
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 98c2e7e3d2c6..ac3dda55e9ae 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -374,6 +374,17 @@ static int ve_start_kthreadd(struct ve_struct *ve)
return err;
 }
 
+static void ve_set_vdso_time(struct ve_struct *ve, u64 time)
+{
+   u64 *vdso_start_time;
+
+   vdso_start_time = ve->vdso_64->data + ve->vdso_64->sym_ve_start_time;
+   *vdso_start_time = time;
+
+   vdso_start_time = ve->vdso_32->data + ve->vdso_32->sym_ve_start_time;
+   *vdso_start_time = time;
+}
+
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
 {
@@ -408,6 +419,8 @@ static int ve_start_container(struct ve_struct *ve)
if (ve->start_time == 0) {
ve->start_time = tsk->start_time;
ve->real_start_time = tsk->real_start_time;
+
+   ve_set_vdso_time(ve, ve->start_time);
}
/* The value is wrong, but it is never compared to process
 * start times */
@@ -1028,6 +1041,7 @@ static ssize_t ve_ts_write(struct kernfs_open_file *of, char *buf,
case VE_CF_CLOCK_MONOTONIC:
now = ktime_get_ns();
target = &ve->start_time;
+   ve_set_vdso_time(ve, now - delta_ns);
break;
case VE_CF_CLOCK_BOOTBASED:
now = ktime_get_boot_ns();
-- 
2.26.2
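
The sign handling in timespec_sub_ns() is easy to get wrong, so here is a
small stand-alone model of it (plain C; a trivial division stands in for
the kernel's __iter_div_u64_rem) showing both directions of the adjustment:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    struct ts { long tv_sec; long tv_nsec; };

    /* stand-in for the kernel's __iter_div_u64_rem() */
    static uint64_t div_rem(uint64_t n, uint64_t base, uint64_t *rem)
    {
            *rem = n % base;
            return n / base;
    }

    /* model of timespec_sub_ns(ts, ns): ts -= ns, ns may be "negative" */
    static void sub_ns(struct ts *t, uint64_t ns)
    {
            uint64_t rem;

            if ((int64_t)ns <= 0) {
                    t->tv_sec += div_rem(-ns, NSEC_PER_SEC, &rem);
                    t->tv_nsec = rem;
            } else {
                    t->tv_sec -= div_rem(ns, NSEC_PER_SEC, &rem);
                    if (rem) {
                            t->tv_sec--;
                            rem = NSEC_PER_SEC - rem;
                    }
                    t->tv_nsec = rem;
            }
    }

    int main(void)
    {
            struct ts t = { 100, 0 };

            sub_ns(&t, 1500000000ULL);           /* subtract 1.5s */
            printf("%ld.%09ld\n", t.tv_sec, t.tv_nsec);  /* 98.500000000 */

            t = (struct ts){ 100, 0 };
            sub_ns(&t, (uint64_t)-500000000LL);  /* "negative" ns adds 0.5s */
            printf("%ld.%09ld\n", t.tv_sec, t.tv_nsec);  /* 100.500000000 */
            return 0;
    }

In do_monotonic() the call is timespec_sub_ns(ts, ve_start_time - ns), i.e.
the container clock is the host clock shifted back by ve_start_time.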


Re: [Devel] [PATCH rh8] mm/swap: activate swapped in pages on fault

2020-10-22 Thread Andrey Ryabinin



On 10/19/20 7:32 PM, Konstantin Khorenko wrote:
> From: Andrey Ryabinin 
> 
> Move swapped-in anon pages directly to the active list. This should
> help us to prevent anon thrashing. Recently swapped-in pages
> have a better chance to stay in memory.
> 
> https://pmc.acronis.com/browse/VSTOR-20859
> Signed-off-by: Andrey Ryabinin 
> [VvS RHEL7.8 rebase] context changes
> 
> (cherry picked from vz7 commit 134cd9b20a914080539e6310f76fe3f7b32bc710)
> Signed-off-by: Konstantin Khorenko 

Reviewed-by: Andrey Ryabinin 


Re: [Devel] [PATCH rh8] ve: Virtualize /proc/swaps to watch from inside CT

2020-10-22 Thread Andrey Ryabinin



On 10/19/20 5:27 PM, Konstantin Khorenko wrote:
> From: Kirill Tkhai 
> 
> Customize /proc/swaps when showing from !ve_is_super.
> Extracted from "Initial patch".
> 
> Signed-off-by: Kirill Tkhai 
> 
> (cherry picked from vz7 commit 88c087f1fdb4b0f7934804269df36035ab6b83eb)
> Signed-off-by: Konstantin Khorenko 


Reviewed-by: Andrey Ryabinin 


[Devel] [PATCH rh8 3/6] ve/sched/loadavg: Calculate avenrun for Containers root cpu cgroups

2020-10-22 Thread Konstantin Khorenko
This patch is a part of vz7 commit (only avenrun part)
34a1dc1e4e3d ("sched: Account task_group::cpustat,taskstats,avenrun")

  Extracted from "Initial patch".

  Signed-off-by: Kirill Tkhai 

  +++
  ve/sched: Do not use kstat_glb_lock to update kstat_glob::nr_unint_avg

  kstat_glob::nr_unint_avg can't be updated in parallel on two or
  more cpus, so on modifications we have to protect against readers
  only.

  So, avoid using the global kstat_glb_lock here, to minimize its
  sharing with the other counters it protects.

  Signed-off-by: Kirill Tkhai 

(cherry picked from commit 715f311fdb4ab0b7922f9e53617c5821ae36bfaf)
Signed-off-by: Konstantin Khorenko 

+++
sched/ve: Use cfs_rq::h_nr_running to count loadavg

cfs_rq::nr_running contains number of child entities
of one level below: tasks and cfs_rq, but it does not
contain tasks from deeper levels.

Use cfs_rq::h_nr_running instead as it contains number
of tasks among all child hierarchy.

https://jira.sw.ru/browse/PSBM-81572

Signed-off-by: Kirill Tkhai 
Reviewed-by: Andrey Ryabinin 

mFixes: 028c54e613a3 ("sched: Account task_group::avenrun")

(cherry picked from vz7 commit 5f2a49a05629bd709ad6bfce83bfacc58a4db3d9)
Signed-off-by: Konstantin Khorenko 

+++
sched/ve: Iterate only VE root cpu cgroups to count loadavg

When counting loadavg we are interested in the VE root cpu cgroups only,
as they are the analogue of the node's loadavg.

So, this patch makes us iterate only over such cpu cgroups
when we calculate loadavg.

Since this code is called from interrupt context, this may give a positive
performance result.

https://jira.sw.ru/browse/PSBM-81572

Signed-off-by: Kirill Tkhai 
Reviewed-by: Andrey Ryabinin 

(cherry picked from vz7 commit 4140a241e5ec2230105f5c4513400a6b5ecea92f)
Signed-off-by: Konstantin Khorenko 

+++
sched: Export calc_load_ve()

This will be used in next patch.

Signed-off-by: Kirill Tkhai 

=
Patchset description:
Make calc_load_ve() be executed out of jiffies_lock

https://jira.sw.ru/browse/PSBM-84967

Kirill Tkhai (3):
  sched: Make calc_global_load() return true when it's need to update ve 
statistic
  sched: Export calc_load_ve()
  sched: Call calc_load_ve() out of jiffies_lock

(cherry picked from vz7 commit 738b92fb2cdd6577925a6b7019925f320cd379df)
Signed-off-by: Konstantin Khorenko 

+++
sched: Call calc_load_ve() out of jiffies_lock

jiffies_lock is a big global seqlock, which is used in many
places. In combination with other actions like smp call
functions and readers of this seqlock, the system may hang for
a long time. There is already a pair of hard lockups caused by
long iteration in calc_load_ve() with jiffies_lock held, which
made readers of this seqlock spin for a long time.

This patch makes calc_load_ve() use a separate lock,
and this relaxes jiffies_lock. I think this should be enough
to resolve the problem, since both of the crashes I saw contain
readers of the seqlock on parallel cpus, and we won't have
to relax further (say, by moving calc_load_ve() to softirq).

Note that the principal change this patch makes is that
jiffies_lock readers on parallel cpus won't wait till calc_load_ve()
finishes, so instead of (n_readers + 1) cpus waiting till
this function completes, there will be only 1 cpu doing that.

https://jira.sw.ru/browse/PSBM-84967

Signed-off-by: Kirill Tkhai 

=
Patchset description:
Make calc_load_ve() be executed out of jiffies_lock

https://jira.sw.ru/browse/PSBM-84967

Kirill Tkhai (3):
  sched: Make calc_global_load() return true when it's need to update ve 
statistic
  sched: Export calc_load_ve()
  sched: Call calc_load_ve() out of jiffies_lock

+++
sched: really don't call calc_load_ve() under jiffies_lock

Previously we've done all preparation work for calc_load_ve() not being
executed under jiffies_lock, and thus not called from
calc_global_load(), but forgot to drop the call in calc_global_load().
So now we still call the expensive calc_load_ve() under the jiffies_lock and
get an NMI.

Fix that.

mFixes: 19bc294a5691d ("sched: Call calc_load_ve() out of jiffies_lock")

https://jira.sw.ru/browse/PSBM-102573

Signed-off-by: Konstantin Khorenko 
Signed-off-by: Valeriy Vdovin 

(cherry picked from vz7 commit 0610b98e5b6537d2ecd99522c3cbd1aa939565e7)
Signed-off-by: Konstantin Khorenko 
---
 include/linux/sched/loadavg.h |  8 ++
 kernel/sched/loadavg.c| 50 +++
 kernel/sched/sched.h  |  1 +
 kernel/time/tick-common.c |  9 ++-
 kernel/time/tick-sched.c  |  6 -
 kernel/time/timekeeping.c |  5 +++-
 6 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 34061919f880..1da5768389b7 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -16,6 +16,8 @@
  */
 extern unsigned long avenrun[];/* Load averages */
 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern 

[Devel] [PATCH rh8 5/6] ve/proc/loadavg: Virtualize /proc/loadavg in Containers

2020-10-22 Thread Konstantin Khorenko
The patch is based on following vz7 commits:
  ecdce58b214c ("sched: Export per task_group statistics_work")
  a58fb58bff1c ("Use ve init task's css instead of opening cgroup via vfs")
  5f2a49a05629 ("sched/ve: Use cfs_rq::h_nr_running to count loadavg")

vz8 rebase notes:
1) cpu cgroup vz specific file "proc.loadavg" has been dropped
2) "nr_running" field in /proc/loadavg inside a CT includes running
   realtime tasks (although they are not allowed to be run inside a CT)
   and tasks in D state (like on the Host)

Signed-off-by: Konstantin Khorenko 
---
 fs/proc/loadavg.c   | 10 ++
 include/linux/ve.h  |  8 
 kernel/sched/core.c | 40 
 kernel/ve/ve.c  | 16 
 4 files changed, 74 insertions(+)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 40467c3ade86..b884a1a59a3d 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -9,10 +9,20 @@
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
+#include <linux/ve.h>
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
unsigned long avnrun[3];
+   struct ve_struct *ve;
+
+   ve = get_exec_env();
+   if (!ve_is_super(ve)) {
+   int ret;
+   ret = ve_show_loadavg(ve, m);
+   if (ret != -ENOSYS)
+   return ret;
+   }
 
get_avenrun(avnrun, FIXED_1/200, 0);
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ec7dc522ac1f..0341bb915923 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -176,4 +176,12 @@ static inline void monotonic_ve_to_abs(clockid_t which_clock,
 
 #endif /* CONFIG_VE */
 
+struct seq_file;
+
+#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+#else
+static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+#endif
+
 #endif /* _LINUX_VE_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6100bf3f625..0116742de578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -41,6 +41,8 @@ const_debug unsigned int sysctl_sched_features =
 #undef SCHED_FEAT
 #endif
 
+#include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */
+
 /*
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
@@ -7134,6 +7136,44 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+   struct seq_file *p)
+{
+   struct cgroup *cgrp = css->cgroup;
+   struct task_group *tg = css_tg(css);
+   unsigned long avnrun[3];
+   int nr_running = 0;
+   int i;
+
+   avnrun[0] = tg->avenrun[0] + FIXED_1/200;
+   avnrun[1] = tg->avenrun[1] + FIXED_1/200;
+   avnrun[2] = tg->avenrun[2] + FIXED_1/200;
+
+   for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+   nr_running += tg->cfs_rq[i]->h_nr_running;
+   /*
+* We do not export nr_unint to parent task groups
+* like we do for h_nr_running, as it gives additional
+* overhead for activate/deactivate operations. So, we
+* don't account child cgroup unint tasks here.
+*/
+   nr_running += tg->cfs_rq[i]->nr_unint;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+   nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+   }
+
+   seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
+   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+   nr_running, cgroup_task_count(cgrp),
idr_get_cursor(&task_active_pid_ns(current)->idr));
+   return 0;
+}
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
{
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 43e37b27e887..193fdb95daab 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1147,3 +1147,19 @@ int vz_security_protocol_check(struct net *net, int protocol)
}
 }
 EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
+#ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_loadavg(struct cgroup_subsys_state *css,
+   struct seq_file *p);
+
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p)
+{
+   struct cgroup_subsys_state *css;
+   int err;
+
+   css = ve_get_init_css(ve, cpu_cgrp_id);
+   err = cpu_cgroup_proc_loadavg(css, p);
+   css_put(css);
+   return err;
+}
+#endif /* CONFIG_CGROUP_SCHED */
-- 
2.28.0
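
The seq_printf() above relies on the kernel's fixed-point loadavg format
(FSHIFT == 11, FIXED_1 == 1 << 11). A quick worked example of the
LOAD_INT/LOAD_FRAC decoding (illustrative only):

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1 << FSHIFT)
    #define LOAD_INT(x)  ((x) >> FSHIFT)
    #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

    int main(void)
    {
            unsigned long avenrun = 2560;  /* fixed-point 1.25 */

            /* prints "1.25", same formatting as loadavg_proc_show() */
            printf("%lu.%02lu\n", LOAD_INT(avenrun), LOAD_FRAC(avenrun));
            return 0;
    }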



[Devel] [PATCH rh8 6/6] vzstat: Add kstat_glob.nr_unint_avg real accounting

2020-10-22 Thread Konstantin Khorenko
This should be a part of commit
127bd48f3385 ("vzstat: Add vzstat module and kstat interfaces")

but depends on task_group::avenrun accounting and thus goes separately.

Signed-off-by: Konstantin Khorenko 
---
 kernel/sched/loadavg.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index c62f34033112..c76b1c842ad8 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -105,7 +105,7 @@ extern spinlock_t load_ve_lock;
 
 void calc_load_ve(void)
 {
-   unsigned long nr_active;
+   unsigned long nr_unint, nr_active;
struct task_group *tg;
int i;
 
@@ -137,6 +137,14 @@ void calc_load_ve(void)
tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
}
+
+   nr_unint = nr_uninterruptible() * FIXED_1;
+
+   write_seqcount_begin(&kstat_glob.nr_unint_avg_seq);
+   kstat_glob.nr_unint_avg[0] = calc_load(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+   kstat_glob.nr_unint_avg[1] = calc_load(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+   kstat_glob.nr_unint_avg[2] = calc_load(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+   write_seqcount_end(&kstat_glob.nr_unint_avg_seq);
spin_unlock(&load_ve_lock);
 }
 #endif /* CONFIG_VE */
-- 
2.28.0
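
For completeness, readers of kstat_glob.nr_unint_avg are expected to pair
with this writer through the same seqcount. A sketch of such a reader
(kernel context assumed; illustrative only):

    /* lockless read of one nr_unint_avg sample */
    static unsigned long read_nr_unint_avg0(void)
    {
            unsigned int seq;
            unsigned long avg;

            do {
                    seq = read_seqcount_begin(&kstat_glob.nr_unint_avg_seq);
                    avg = kstat_glob.nr_unint_avg[0];
            } while (read_seqcount_retry(&kstat_glob.nr_unint_avg_seq, seq));

            return avg;
    }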



[Devel] [PATCH rh8 4/6] ve/sysinfo/loadavg: Virtualize loadavg values in sysinfo()

2020-10-22 Thread Konstantin Khorenko
Fixes: 688c65f8eaf1 ("ve: Virtualize sysinfo")
TODO: move appropriate hunk to this commit from the commit above

Signed-off-by: Konstantin Khorenko 
---
 kernel/sys.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/kernel/sys.c b/kernel/sys.c
index 2646c8041258..e7e07ea8d7ef 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2575,11 +2575,7 @@ static int do_sysinfo(struct sysinfo *info)
 
info->procs = nr_threads_ve(ve);
 
-#if 0
-FIXME after
-715f311fdb4a ("sched: Account task_group::cpustat,taskstats,avenrun") is ported
get_avenrun_ve(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
-#endif
}
 
/*
-- 
2.28.0
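
sysinfo() exposes load averages scaled by 1 << SI_LOAD_SHIFT (65536), hence
the SI_LOAD_SHIFT - FSHIFT conversion above. From inside a CT the
virtualized values can be read back like this (illustrative only):

    #include <stdio.h>
    #include <sys/sysinfo.h>

    int main(void)
    {
            struct sysinfo si;

            if (sysinfo(&si))
                    return 1;

            /* loads[] are fixed-point, 1 << SI_LOAD_SHIFT == 65536 is 1.0 */
            printf("loadavg: %.2f %.2f %.2f\n",
                   si.loads[0] / 65536.0,
                   si.loads[1] / 65536.0,
                   si.loads[2] / 65536.0);
            return 0;
    }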



[Devel] [PATCH rh8 2/6] sched: Make calc_global_load() return true when it's need to update ve statistic

2020-10-22 Thread Konstantin Khorenko
From: Kirill Tkhai 

This will be used in the next patches to call calc_load_ve() outside of jiffies_lock.

Signed-off-by: Kirill Tkhai 

=
Patchset description:
Make calc_load_ve() be executed out of jiffies_lock

https://jira.sw.ru/browse/PSBM-84967

Kirill Tkhai (3):
  sched: Make calc_global_load() return true when it's need to
 update ve statistic
  sched: Export calc_load_ve()
  sched: Call calc_load_ve() out of jiffies_lock

(cherry picked from vz commit b26208e2f8bae0bc539bef9f37d5fc650e47e092)
Signed-off-by: Konstantin Khorenko 
---
 include/linux/sched/loadavg.h | 4 +++-
 kernel/sched/loadavg.c| 5 +++--
 kernel/time/timekeeping.c | 4 ++--
 kernel/time/timekeeping.h | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 4859bea47a7b..34061919f880 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_SCHED_LOADAVG_H
 #define _LINUX_SCHED_LOADAVG_H
 
+#include <linux/types.h>
+
 /*
  * These are the constant used to fake the fixed-point load-average
  * counting. Some notes:
@@ -43,6 +45,6 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 
-extern void calc_global_load(unsigned long ticks);
+extern bool calc_global_load(unsigned long ticks);
 
 #endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index de22da666ac7..a7b373053dc4 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -347,14 +347,14 @@ static inline void calc_global_nohz(void) { }
  *
  * Called from the global timer code.
  */
-void calc_global_load(unsigned long ticks)
+bool calc_global_load(unsigned long ticks)
 {
unsigned long sample_window;
long active, delta;
 
sample_window = READ_ONCE(calc_load_update);
if (time_before(jiffies, sample_window + 10))
-   return;
+   return false;
 
/*
 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
@@ -377,6 +377,7 @@ void calc_global_load(unsigned long ticks)
 * catch up in bulk.
 */
calc_global_nohz();
+   return true;
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4118179d8c75..bce92a9952f4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2192,10 +2192,10 @@ EXPORT_SYMBOL(ktime_get_coarse_ts64);
 /*
  * Must hold jiffies_lock
  */
-void do_timer(unsigned long ticks)
+bool do_timer(unsigned long ticks)
 {
jiffies_64 += ticks;
-   calc_global_load(ticks);
+   return calc_global_load(ticks);
 }
 
 /**
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..7b6cdb0563f4 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -15,7 +15,7 @@ extern void timekeeping_warp_clock(void);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
-extern void do_timer(unsigned long ticks);
+extern bool do_timer(unsigned long ticks);
 extern void update_wall_time(void);
 
 extern seqlock_t jiffies_lock;
-- 
2.28.0
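
A stand-alone model of how the new bool return is meant to be consumed
(per the "Call calc_load_ve() out of jiffies_lock" patch in this series;
the lock and timer functions below are stand-ins, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* stand-ins for kernel primitives, just to model the control flow */
    static void write_seqlock(void)   { /* take jiffies_lock */ }
    static void write_sequnlock(void) { /* drop jiffies_lock */ }
    static bool do_timer(unsigned long ticks) { (void)ticks; return true; }
    static void calc_load_ve(void) { puts("calc_load_ve: no jiffies_lock"); }

    int main(void)
    {
            bool update_ve;

            write_seqlock();
            update_ve = do_timer(1);  /* returns whether ve stats need work */
            write_sequnlock();

            if (update_ve)
                    calc_load_ve();   /* heavy part runs outside the lock */
            return 0;
    }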



[Devel] [PATCH rh8 1/6] ve/sched: Link VE root cpu cgroups in separate list

2020-10-22 Thread Konstantin Khorenko
From: Kirill Tkhai 

The idea is to link the small number of VE root cpu cgroups
into a separate list. This allows us to avoid unnecessary
calculations of loadavg for VE child cpu cgroups
in the next patches, and it should improve
the performance of calc_load_ve().

https://jira.sw.ru/browse/PSBM-81572

Signed-off-by: Kirill Tkhai 
Reviewed-by: Andrey Ryabinin 

(cherry picked from commit vz7 c9af0076fff0ac796dcbec8ef17424ae08a9f54d)
Signed-off-by: Konstantin Khorenko 

+++
ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list

A Container's cpu cgroup is linked to "ve_root_list" on CT start.
But if someone holds the CT's cpu cgroup while the CT is being stopped,
the next CT start tries to create the same cpu cgroup (which fails, as it
already exists) and links this cpu cgroup to "ve_root_list" again,
thus corrupting the list.

As a consequence calc_load_ve() goes into an endless loop.

Let's check if task_group has been already linked to the list and skip
redundant linking.

Locking scheme change:
- drop rcu for list ve_root_list, we hold spinlocks anyway
- use "load_ve_lock" spinlock for both list add/del/iterate,
  "task_group_lock" is unrelated here

How to reproduce:

 # vzctl start 200
 # echo $$ > /sys/fs/cgroup/cpu/machine.slice/200/tasks
 # vzctl stop 200
 // At this moment VE cgroup got destroyed, but cpu cgroup is still alive
 // and linked to "ve_root_list" list

 # vzctl start 200
 // double add of same tg (same cpu cgroup) to "ve_root_list" list =>
 // list corruption => endless loop in next calc_load_ve() call

https://jira.sw.ru/browse/PSBM-88251

Signed-off-by: Konstantin Khorenko 
Acked-by: Kirill Tkhai 
Reviewed-by: Andrey Ryabinin 

v2 changes:
 - change locking scheme: drop rcu, use "load_ve_lock" everywhere
 - drop tg->linked field, check if linked using list_empty()

[VvS RHEL77b rebase]

(cherry picked from vz7 commit cba368b94c0ad159f676539f554e9cc9d53aedaa)
Signed-off-by: Konstantin Khorenko 
---
 include/linux/sched.h  |  8 
 kernel/cgroup/cgroup.c |  1 +
 kernel/sched/core.c| 31 +++
 kernel/sched/sched.h   |  4 
 4 files changed, 44 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4326aa24e9dc..cabed6a47a70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2015,4 +2015,12 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#ifdef CONFIG_VE
+struct cgroup_subsys_state;
+extern void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css);
+#else /* CONFIG_VE */
+static inline void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) { }
+#endif /* CONFIG_VE */
+
 #endif
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 08137d43f3ab..4ee3eb24b0d1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1894,6 +1894,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
cgrp = link->cgrp;
set_bit(CGRP_VE_ROOT, &cgrp->flags);
}
+   link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 unlock:
rcu_read_unlock();
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a57956d64d6..a6100bf3f625 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6581,6 +6581,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
 
+#ifdef CONFIG_VE
+   INIT_LIST_HEAD(&tg->ve_root_list);
+#endif
if (parent)
sched_online_group(tg, parent);
return 0;
@@ -6590,6 +6593,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
struct task_group *tg = css_tg(css);
 
+   unlink_ve_root_cpu_cgroup(css);
sched_offline_group(tg);
 }
 
@@ -6677,6 +6681,33 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
return (u64) scale_load_down(tg->shares);
 }
 
+#ifdef CONFIG_VE
+LIST_HEAD(ve_root_list);
+DEFINE_SPINLOCK(load_ve_lock);
+
+void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+   struct task_group *tg = css_tg(css);
+   unsigned long flags;
+
+   spin_lock_irqsave(&load_ve_lock, flags);
+   BUG_ON(!(css->flags & CSS_ONLINE));
+   if (list_empty(&tg->ve_root_list))
+   list_add(&tg->ve_root_list, &ve_root_list);
+   spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+
+void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
+{
+   struct task_group *tg = css_tg(css);
+   unsigned long flags;
+
+   spin_lock_irqsave(&load_ve_lock, flags);
+   list_del_init(&tg->ve_root_list);
+   spin_unlock_irqrestore(&load_ve_lock, flags);
+}
+#endif /* CONFIG_VE */
+
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2f0c26b2c50..93bf1d78c27d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,10 @@ struct task_group {
struct 

[Devel] [PATCH rh8 0/6] ve/sched/loadavg: loadavg virtualization

2020-10-22 Thread Konstantin Khorenko
Current patchset is a rework of following vz7 patches:

  5655edce75a2 vzstat: Add kstat_glob.nr_unint_avg real accounting
  7ca32010adaa ve/proc/loadavg: Virtualize /proc/loadavg in Containers

  feba442cc064 sched: Call calc_load_ve() out of jiffies_lock
  3c158be41cd2 sched: Export calc_load_ve()
  a113575a6c6e sched: Make calc_global_load() return true when it's need to
update ve statistic
  6fb0a9d805a1 sched/ve: Iterate only VE root cpu cgroups to count loadavg
  71e893d4a552 sched/ve: Use cfs_rq::h_nr_running to count loadavg
  028c54e613a3 sched: Account task_group::avenrun -> rename to
ve/sched/loadavg: Calculate avenrun for Containers root cpu
cgroups

  72108f28ffca ve/cgroup: do not link a CT cpu cgroup twice into ve_root_list
  8d5159d1f0d7 sched/ve: Link VE root cpu cgroups in separate list

loadavg values are virtualized in /proc/loadavg file and in sysinfo() output.

cpu cgroup::proc.loadavg file has been dropped (present in vz7, but it seems
nobody uses it)

This patchset obsoletes previously sent patches:
  sched: Account task_group::avenrun
  vzstat: Add kstat_glob.nr_unint_avg real accounting


Kirill Tkhai (2):
  ve/sched: Link VE root cpu cgroups in separate list
  sched: Make calc_global_load() return true when it's need to update ve
statistic

Konstantin Khorenko (4):
  ve/sched/loadavg: Calculate avenrun for Containers root cpu cgroups
  ve/sysinfo/loadavg: Virtualize loadavg values in sysinfo()
  ve/proc/loadavg: Virtualize /proc/loadavg in Containers
  vzstat: Add kstat_glob.nr_unint_avg real accounting

 fs/proc/loadavg.c | 10 +
 include/linux/sched.h |  8 
 include/linux/sched/loadavg.h | 12 +-
 include/linux/ve.h|  8 
 kernel/cgroup/cgroup.c|  1 +
 kernel/sched/core.c   | 71 +++
 kernel/sched/loadavg.c| 63 ++-
 kernel/sched/sched.h  |  5 +++
 kernel/sys.c  |  4 --
 kernel/time/tick-common.c |  9 -
 kernel/time/tick-sched.c  |  6 ++-
 kernel/time/timekeeping.c |  9 +++--
 kernel/time/timekeeping.h |  2 +-
 kernel/ve/ve.c| 16 
 14 files changed, 211 insertions(+), 13 deletions(-)

-- 
2.28.0
