Reviewed-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com> On 1/14/20 9:54 PM, Valeriy Vdovin wrote: > https://jira.sw.ru/browse/PSBM-100083 > > Introduced 'real_start_time_ct' field in task_struct. > > The value is READ: > 1. When the process lives inside of a ve group and any process > inside of the same ve group wants to know its start time by reading > its /proc/[pid]/stat file. > 2. At container suspend operation to store this value to a dump image. > > The value is WRITTEN: > 1. At creation time (copy_process function) > 1.1. If a process is being created outside of ve group / on host, then > this value is initialized to 0. > 1.2. If a process is being created by a process already living in ve > group, this value is calculated as host_uptime - ve_uptime. > > 2. During attach to ve (ve_attach function). The process can be created on > a host and later attached to ve. Its container start_time value has > already been initialized to 0 at creation time. After the process enters the > domain of a ve, the value should be initialized. Note that the process > can be attached to a non-running container, in which case its > start_time value should not be calculated and is left initialized to 0. > > 3. At container restore via prctl (prctl_set_task_ct_fields function). > In this case the value is only settable outside of a container. > During restore the processes would be created from the dump image. > At restore step each process will execute prctl to set its start_time > value, read from the dump. This would only be permitted during > pseudosuper ve mode. The value is set as is (read from the dump), without > any calculations. 
> > Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com> > --- > fs/proc/array.c | 16 ++++------------ > include/linux/sched.h | 5 +++++ > include/linux/ve.h | 23 +++++++++++++++++++++++ > include/uapi/linux/prctl.h | 7 +++++++ > kernel/fork.c | 13 +++++++++++++ > kernel/sys.c | 23 +++++++++++++++++++++++ > kernel/ve/ve.c | 2 ++ > 7 files changed, 77 insertions(+), 12 deletions(-) > > diff --git a/fs/proc/array.c b/fs/proc/array.c > index 3aa8a7d..fb611b1 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct > pid_namespace *ns, > (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC > + task->real_start_time.tv_nsec; > #ifdef CONFIG_VE > - if (!is_super) { > - struct timespec *ve_start_ts = > - &get_exec_env()->real_start_timespec; > - start_time -= > - (unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC > - + ve_start_ts->tv_nsec; > - } > - /* tasks inside a CT can have negative start time e.g. if the CT was > - * migrated from another hw node, in which case we will report 0 in > - * order not to confuse userspace */ > - if ((s64)start_time < 0) > - start_time = 0; > + if (!is_super) > + start_time = (unsigned long long) > + timespec_to_ns(&task->real_start_time_ct); > #endif > + > /* convert nsec -> ticks */ > start_time = nsec_to_clock_t(start_time); > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 07f9954..0832904 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1934,6 +1934,11 @@ struct task_struct { > struct wake_q_node wake_q; > struct prev_cputime prev_cputime; > struct vtime vtime; > + /* > + * this is a container-side copy of 'real_start_time' field > + * shown from inside of a container and modified by host. 
> + */ > + struct timespec real_start_time_ct; > #endif /* __GENKSYMS__ */ > }; > > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 9d60838..088e274 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -199,6 +199,29 @@ static inline struct ve_struct *cgroup_ve(struct cgroup > *cgroup) > struct ve_struct, css); > } > > +static inline void ve_try_set_task_start_time(struct ve_struct *ve, > + struct task_struct *t) > +{ > + struct timespec host_uptime; > + > + /* > + * mitigate memory access reordering risks by doing double check, > + * 'is_running' could be read as 1 before we see > + * 'real_start_timespec' updated here. If it's still 0, > + * we know 'is_running' is being modified right NOW in > + * parallel so it's safe to say that start time is also 0 > + */ > + if (!ve->is_running || !timespec_to_ns(&ve->real_start_timespec)) { > + t->real_start_time_ct.tv_sec = 0; > + t->real_start_time_ct.tv_nsec = 0; > + } else { > + do_posix_clock_monotonic_gettime(&host_uptime); > + monotonic_to_bootbased(&host_uptime); > + t->real_start_time_ct = timespec_sub(host_uptime, > + ve->real_start_timespec); > + } > +} > + > extern unsigned long long ve_relative_clock(struct timespec * ts); > extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp); > extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp); > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > index 02376de..b185113 100644 > --- a/include/uapi/linux/prctl.h > +++ b/include/uapi/linux/prctl.h > @@ -204,5 +204,12 @@ struct prctl_mm_map { > # define PR_SPEC_DISABLE (1UL << 2) > # define PR_SPEC_FORCE_DISABLE (1UL << 3) > # define PR_SPEC_DISABLE_NOEXEC (1UL << 4) > +/* Set task container related fields */ > +#define PR_SET_TASK_CT_FIELDS 1000 > +#define PR_TASK_CT_FIELDS_START_TIME (1ULL << 0) > + > +struct prctl_task_ct_fields { > + __s64 real_start_time; > +}; > > #endif /* _LINUX_PRCTL_H */ > diff --git a/kernel/fork.c 
b/kernel/fork.c > index 3d74228..2314eb8 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -1350,6 +1350,10 @@ static struct task_struct *copy_process(unsigned long > clone_flags, > struct task_struct *p; > void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; > > +#ifdef CONFIG_VE > + struct ve_struct *ve = get_exec_env(); > +#endif > + > if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) > return ERR_PTR(-EINVAL); > > @@ -1472,6 +1476,15 @@ static struct task_struct *copy_process(unsigned long > clone_flags, > do_posix_clock_monotonic_gettime(&p->start_time); > p->real_start_time = p->start_time; > monotonic_to_bootbased(&p->real_start_time); > + > + p->real_start_time_ct.tv_sec = 0; > + p->real_start_time_ct.tv_nsec = 0; > + > +#ifdef CONFIG_VE > + if (!ve_is_super(ve)) > + ve_try_set_task_start_time(ve, p); > +#endif > + > p->io_context = NULL; > p->audit_context = NULL; > if (clone_flags & CLONE_THREAD) > diff --git a/kernel/sys.c b/kernel/sys.c > index c4d633ef..2ce16c7 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct task_struct > *me, int __user **tid_addr) > } > #endif > > +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg, > + unsigned long flags) > +{ > + struct prctl_task_ct_fields params; > +#ifdef CONFIG_VE > + struct ve_struct *ve = t->task_ve; > + > + if (!ve_is_super(ve) && !ve->is_pseudosuper) > + return -EPERM; > +#endif > + > + if (copy_from_user(¶ms, (const void __user *)arg, sizeof(params))) > + return -EFAULT; > + > + if (flags & PR_TASK_CT_FIELDS_START_TIME) > + t->real_start_time_ct = ns_to_timespec(params.real_start_time); > + > + return 0; > +} > + > int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long > which) > { > return -EINVAL; > @@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, > arg2, unsigned long, arg3, > return -EINVAL; > error = arch_prctl_spec_ctrl_set(me, arg2, arg3); > 
break; > + case PR_SET_TASK_CT_FIELDS: > + error = prctl_set_task_ct_fields(me, arg2, arg3); > + break; > default: > error = -EINVAL; > break; > diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c > index ad3a698..f3970e8 100644 > --- a/kernel/ve/ve.c > +++ b/kernel/ve/ve.c > @@ -850,6 +850,8 @@ static void ve_attach(struct cgroup *cg, struct > cgroup_taskset *tset) > /* Leave parent exec domain */ > task->parent_exec_id--; > > + ve_try_set_task_start_time(ve, task); > + > task->task_ve = ve; > } > >
-- Best regards, Tikhomirov Pavel Software Developer, Virtuozzo. _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel