https://jira.sw.ru/browse/PSBM-100083

Introduced 'real_start_time_ct' field in task_struct.

The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.

The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of ve group / on host, then
this value is initialized to 0
1.2. If a process is being created by a process already living in a ve
group, this value is calculated as host_uptime - ve->real_start_timespec
(see ve_try_set_task_start_time).

2. During attach to ve (ve_attach function). The process can be created on
a host and later attached to ve. Its container start_time value has
already been initialized to 0 at creation time. After the process enters the
domain of a ve, the value should be initialized. Note that the process
can be attached to a non-running container, in which case its
start_time value should not be calculated and is left initialized to 0.

3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable outside of a container.
During restore the processes would be created from the dump image.
At restore step each process will execute prctl to set its start_time
value, read from the dump. This would only be permitted during
pseudosuper ve mode. The value is set as is (read from the dump), without
any calculations.

Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
---
 fs/proc/array.c            | 16 ++++------------
 include/linux/sched.h      |  5 +++++
 include/linux/ve.h         | 23 +++++++++++++++++++++++
 include/uapi/linux/prctl.h |  7 +++++++
 kernel/fork.c              | 13 +++++++++++++
 kernel/sys.c               | 23 +++++++++++++++++++++++
 kernel/ve/ve.c             |  2 ++
 7 files changed, 77 insertions(+), 12 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3aa8a7d..fb611b1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
                                + task->real_start_time.tv_nsec;
 #ifdef CONFIG_VE
-       if (!is_super) {
-               struct timespec *ve_start_ts =
-                               &get_exec_env()->real_start_timespec;
-               start_time -=
-                       (unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
-                               + ve_start_ts->tv_nsec;
-       }
-       /* tasks inside a CT can have negative start time e.g. if the CT was
-        * migrated from another hw node, in which case we will report 0 in
-        * order not to confuse userspace */
-       if ((s64)start_time < 0)
-               start_time = 0;
+       if (!is_super)
+               start_time = (unsigned long long)
+                       timespec_to_ns(&task->real_start_time_ct);
 #endif
+
        /* convert nsec -> ticks */
        start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f9954..0832904 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1934,6 +1934,11 @@ struct task_struct {
        struct wake_q_node wake_q;
        struct prev_cputime prev_cputime;
        struct vtime vtime;
+       /*
+        * this is a container-side copy of 'real_start_time' field
+        * shown from inside of a container and modified by host.
+        */
+       struct timespec real_start_time_ct;
 #endif /* __GENKSYMS__ */
 };
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 9d60838..088e274 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -199,6 +199,29 @@ static inline struct ve_struct *cgroup_ve(struct cgroup *cgroup)
                        struct ve_struct, css);
 }
 
+static inline void ve_try_set_task_start_time(struct ve_struct *ve,
+       struct task_struct *t)
+{
+       struct timespec host_uptime;
+
+       /*
+        * Mitigate memory access reordering risks with a double check:
+        * 'is_running' could be read as 1 before we see
+        * 'real_start_timespec' updated here. If the latter is still 0,
+        * we know 'is_running' is being modified right NOW in
+        * parallel, so it's safe to say that start time is also 0.
+        */
+       if (!ve->is_running || !timespec_to_ns(&ve->real_start_timespec)) {
+               t->real_start_time_ct.tv_sec = 0;
+               t->real_start_time_ct.tv_nsec = 0;
+       } else {
+               do_posix_clock_monotonic_gettime(&host_uptime);
+               monotonic_to_bootbased(&host_uptime);
+               t->real_start_time_ct = timespec_sub(host_uptime,
+                       ve->real_start_timespec);
+       }
+}
+
 extern unsigned long long ve_relative_clock(struct timespec * ts);
 extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp);
 extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp);
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 02376de..b185113 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -204,5 +204,12 @@ struct prctl_mm_map {
 # define PR_SPEC_DISABLE               (1UL << 2)
 # define PR_SPEC_FORCE_DISABLE         (1UL << 3)
 # define PR_SPEC_DISABLE_NOEXEC                (1UL << 4)
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS  1000
+#define PR_TASK_CT_FIELDS_START_TIME   (1ULL << 0)
+
+struct prctl_task_ct_fields {
+       __s64 real_start_time;
+};
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d74228..2314eb8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1350,6 +1350,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        struct task_struct *p;
        void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
+#ifdef CONFIG_VE
+       struct ve_struct *ve = get_exec_env();
+#endif
+
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);
 
@@ -1472,6 +1476,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        do_posix_clock_monotonic_gettime(&p->start_time);
        p->real_start_time = p->start_time;
        monotonic_to_bootbased(&p->real_start_time);
+
+       p->real_start_time_ct.tv_sec = 0;
+       p->real_start_time_ct.tv_nsec = 0;
+
+#ifdef CONFIG_VE
+       if (!ve_is_super(ve))
+               ve_try_set_task_start_time(ve, p);
+#endif
+
        p->io_context = NULL;
        p->audit_context = NULL;
        if (clone_flags & CLONE_THREAD)
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d633ef..2ce16c7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+               unsigned long flags)
+{
+       struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+       struct ve_struct *ve = t->task_ve;
+
+       if (!ve_is_super(ve) && !ve->is_pseudosuper)
+               return -EPERM;
+#endif
+
+       if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+               return -EFAULT;
+
+       if (flags & PR_TASK_CT_FIELDS_START_TIME)
+               t->real_start_time_ct = ns_to_timespec(params.real_start_time);
+
+       return 0;
+}
+
 int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
 {
        return -EINVAL;
@@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        return -EINVAL;
                error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
                break;
+       case PR_SET_TASK_CT_FIELDS:
+               error = prctl_set_task_ct_fields(me, arg2, arg3);
+               break;
        default:
                error = -EINVAL;
                break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..f3970e8 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -850,6 +850,8 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
                /* Leave parent exec domain */
                task->parent_exec_id--;
 
+               ve_try_set_task_start_time(ve, task);
+
                task->task_ve = ve;
        }
 
-- 
1.8.3.1

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to