Initially I missed it, but ignoring retcode is really bad mistake.

On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdo...@virtuozzo.com>
> ---
>   criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
>   criu/cr-restore.c      | 68 
> ++++++++++++++++++++++++++++++--------------------
>   criu/include/crtools.h | 31 +++++++++++++++++++++++
>   images/core.proto      |  2 ++
>   4 files changed, 123 insertions(+), 27 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..751141f 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const 
> struct parasite_dump_thread
>       return ret;
>   }
>   
> +struct get_internal_start_time_rq {
> +     int pid;
> +     unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> +     struct proc_pid_stat p;
> +     struct get_internal_start_time_rq *r =
> +             (struct get_internal_start_time_rq *)arg;
> +
> +     /* We need to join ve to access container relative
> +      * value of task's start_time, otherwize we will see
> +      * start_time visible to host.
> +      */
> +     join_veX(r->pid);

Not checking return code here.

> +
> +     parse_pid_stat(r->pid, &p);

And here.

> +     r->result = p.start_time;
> +     return 0;
> +}
> +
> +static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
> +{
> +     int ret;
> +     struct get_internal_start_time_rq r = {
> +             .pid = pid,
> +             .result = 0
> +     };
> +
> +     ret = call_in_child_process(child_get_internal_start_time, &r);
> +     if (ret) {
> +             pr_err("Failed to exec in child\n");
> +             return ret;
> +     }
> +
> +     tc->has_start_time = 1;
> +     tc->start_time = r.result;
> +     return 0;
> +}
> +
>   static int dump_task_core_all(struct parasite_ctl *ctl,
>                             struct pstree_item *item,
>                             const struct proc_pid_stat *stat,
> @@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
>       core->tc->task_state = item->pid->state;
>       core->tc->exit_code = 0;
>   
> +     ret = dump_task_internal_start_time(pid, core->tc);
> +     if (ret) {
> +             pr_err("Failed to dump start_time for task %d\n", pid);
> +             goto err;
> +     }
> +
> +     pr_info("Dumped start_time of task %d is %lu\n", pid, 
> core->tc->start_time);
> +
>       if (stat->tty_nr) {
>               struct pstree_item *p = item;
>   
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..c9777ae 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry 
> *tc)
>   static int prepare_itimers(int pid, struct task_restore_args *args, 
> CoreEntry *core);
>   static int prepare_mm(pid_t pid, struct task_restore_args *args);
>   
> +static int restore_start_time(int pid, CoreEntry *core)
> +{
> +     unsigned long long total_nsec;
> +     unsigned long flags;
> +     long tps;
> +     struct prctl_task_ct_fields ct_fields;
> +
> +     if (!core->tc->has_start_time) {
> +             pr_warn("Skipping restore_start_time for old image version.\n");
> +             return -1;
> +     }
> +
> +     tps = sysconf(_SC_CLK_TCK);
> +     if (tps == -1) {
> +             pr_perror("Failed to get clock ticks via sysconf");
> +             return -1;
> +     }
> +
> +     total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
> +
> +     ct_fields.real_start_time = total_nsec;
> +     flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +     if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 
> 0)) {
> +             pr_perror("Can't set process start time");
> +             return -1;
> +     }
> +
> +     pr_info("Restored start_time of task %d is %lu\n",
> +             pid, core->tc->start_time);
> +
> +     return 0;
> +}
> +
>   static int restore_one_alive_task(int pid, CoreEntry *core)
>   {
>       unsigned args_len;
> @@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry 
> *core)
>   
>       rst_mem_switch_to_private();
>   
> +     restore_start_time(pid, core);

And here. You should explicitly return 0 in case of EINVAL returned from 
prctl and write a warning that prctl is not supported, and you should 
also return 0 in case of !has_start_time for backward compatibility. 
Sysconf errors and all other errors from prctl do matter, we can't 
ignore them.

I think we need to rework your patch via kdat to call 
prctl(PR_SET_TASK_CT_FIELDS) only once if it's not supported by kernel.

> +
>       args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
>                       current->nr_threads, page_size());
>       ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | 
> MAP_PRIVATE, 0, 0);
> @@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
>   
>   static int wait_exiting_children(char *prefix);
>   
> -static int restore_one_zombie(CoreEntry *core)
> +static int restore_one_zombie(int pid, CoreEntry *core)
>   {
>       int exit_code = core->tc->exit_code;
>   
> @@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
>       if (lazy_pages_setup_zombie(vpid(current)))
>               return -1;
>   
> +     restore_start_time(pid, core);
> +
>       prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>   
>       if (task_entries != NULL) {
> @@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
>               zombie_prepare_signals();
>       }
>   
> +
>       if (exit_code & 0x7f) {
>               int signr;
>   
> @@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
>       if (task_alive(current))
>               ret = restore_one_alive_task(pid, core);
>       else if (current->pid->state == TASK_DEAD)
> -             ret = restore_one_zombie(core);
> +             ret = restore_one_zombie(pid, core);
>       else if (current->pid->state == TASK_HELPER) {
>               ret = restore_one_helper();
>       } else {
> @@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
>   
>   extern char *get_dumpee_veid(pid_t pid_real);
>   
> -#define join_veX(pid)        join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit 
> execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary 
> mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and 
> execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid)        join_ve(pid, false)
> -
>   /*
>    * To eliminate overhead we don't parse VE cgroup mountpoint
>    * but presume to find it in known place. Otherwise simply
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..acb1faf 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const 
> char *sep, int width);
>                       .actor = name##_cb,                     \
>       }
>   
> +#define join_veX(pid)        join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit 
> execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary 
> mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and 
> execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid)        join_ve(pid, false)
> +
> +/*
> + * To eliminate overhead we don't parse VE cgroup mountpoint
> + * but presume to find it in known place. Otherwise simply
> + * don't enter into veX with one warning.
> + */
> +
>   int join_ve(pid_t pid, bool veX);
>   
>   #endif /* __CR_CRTOOLS_H__ */
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..c164c7a 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
>       optional int32                  tty_nr          = 15;
>       optional int32                  tty_pgrp        = 16;
>       repeated sa_entry               sigactions      = 17;
> +     optional uint64                 start_time      = 19;
>   }
>   
>   message task_core_entry {
> @@ -79,6 +80,7 @@ message task_core_entry {
>       repeated sa_entry               sigactions      = 15;
>       optional int32                  tty_nr          = 16;
>       optional int32                  tty_pgrp        = 17;
> +     optional uint64                 start_time      = 19;
>   }
>   
>   message task_kobj_ids_entry {
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to