[linuxkernelnewbies] implementation of execve() --> how a process can entirely wipe away a previously executing process

Peter Teoh Mon, 01 Sep 2008 22:11:11 -0700


And thus calling "exec" recursively does not actually recurse: a successful exec replaces the calling program instead of returning to it.


/*
 * do_execve() - build a linux_binprm for @filename and hand it to the
 * binary handlers.
 *
 * @filename: path of the program to run; opened with open_exec(), stored in
 *            bprm->filename and bprm->interp, and copied in as a kernel
 *            string.
 * @argv:     user-space argument vector; counted, then copied into bprm.
 * @envp:     user-space environment vector; counted, then copied into bprm.
 * @regs:     passed through unchanged to search_binary_handler().
 *
 * Returns the result of search_binary_handler() when it is >= 0.  On any
 * failure the value produced by the failing step is returned, after the
 * out* labels have undone the setup steps in reverse order.
 */
int do_execve(char * filename,
        char __user *__user *argv,
        char __user *__user *envp,
        struct pt_regs * regs)
{
        struct linux_binprm *bprm;
        struct file *file;
        struct files_struct *displaced;
        int retval;

        /*
         * 'displaced' is filled in by unshare_files(); it is dropped with
         * put_files_struct() on success and handed to reset_files_struct()
         * on failure (see out_files).
         */
        retval = unshare_files(&displaced);
        if (retval)
                goto out_ret;

        /* Preset the error code so the allocation failure path needs no assignment. */
        retval = -ENOMEM;
        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm)
                goto out_files;

        file = open_exec(filename);
        /* retval is only meaningful here when IS_ERR(file) is true. */
        retval = PTR_ERR(file);
        if (IS_ERR(file))
                goto out_kfree;

        sched_exec();

        bprm->file = file;
        bprm->filename = filename;
        bprm->interp = filename;

        retval = bprm_mm_init(bprm);
        if (retval)
                goto out_file;

        /* count() results double as error codes when negative. */
        bprm->argc = count(argv, MAX_ARG_STRINGS);
        if ((retval = bprm->argc) < 0)
                goto out_mm;

        bprm->envc = count(envp, MAX_ARG_STRINGS);
        if ((retval = bprm->envc) < 0)
                goto out_mm;

        /* From here on failures go to 'out', which also frees bprm->security. */
        retval = security_bprm_alloc(bprm);
        if (retval)
                goto out;

        retval = prepare_binprm(bprm);
        if (retval < 0)
                goto out;

        /* Copy order: filename first, then the environment, then the arguments. */
        retval = copy_strings_kernel(1, &bprm->filename, bprm);
        if (retval < 0)
                goto out;

        /* Remember bprm->p as it stands right after the filename copy. */
        bprm->exec = bprm->p;
        retval = copy_strings(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out;

        retval = copy_strings(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out;

        /* Clear PF_KTHREAD on the current task before trying the handlers. */
        current->flags &= ~PF_KTHREAD;
        retval = search_binary_handler(bprm,regs);
        if (retval >= 0) {
                /* execve success */
                security_bprm_free(bprm);
                acct_update_integrals(current);
                free_bprm(bprm);
                if (displaced)
                        put_files_struct(displaced);
                return retval;
        }

        /*
         * Error unwinding: each label releases one resource and falls
         * through to the labels for the resources acquired before it.
         */
out:
        if (bprm->security)
                security_bprm_free(bprm);

out_mm:
        if (bprm->mm)
                mmput (bprm->mm);

out_file:
        if (bprm->file) {
                allow_write_access(bprm->file);
                fput(bprm->file);
        }
out_kfree:
        free_bprm(bprm);

out_files:
        if (displaced)
                reset_files_struct(displaced);
out_ret:
        return retval;
}


and CR3 loading is done in switch_mm():

/*
 * switch_mm() - make @next the mm in use on the current CPU.
 *
 * @prev: mm being switched away from.
 * @next: mm being switched to.
 * @tsk:  not referenced by this implementation.
 *
 * If the mm really changes, this CPU is removed from prev->cpu_vm_mask and
 * added to next->cpu_vm_mask, CR3 is loaded with next->pgd, and the LDT is
 * reloaded when the two contexts' ldt pointers differ.
 *
 * If prev == next, nothing is done on non-SMP builds.  On SMP builds the
 * CPU's bit in next->cpu_vm_mask is tested and set; CR3 and the LDT are
 * reloaded only when that bit was previously clear.
 */
static inline void switch_mm(struct mm_struct *prev,
                             struct mm_struct *next,
                             struct task_struct *tsk)
{
        int cpu = smp_processor_id();

        if (likely(prev != next)) {
                /* stop flush ipis for the previous mm */
                cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
                /* Update this CPU's tlbstate before it is marked in next's mask. */
                per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
                per_cpu(cpu_tlbstate, cpu).active_mm = next;
#endif
                cpu_set(cpu, next->cpu_vm_mask);

                /* Re-load page tables */
                load_cr3(next->pgd);

                /*
                 * load the LDT, if the LDT is different:
                 */
                if (unlikely(prev->context.ldt != next->context.ldt))
                        load_LDT_nolock(&next->context);
        }
#ifdef CONFIG_SMP
        else {
                /* Same mm: active_mm must already be 'next', so only the state is reset. */
                per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
                BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);

                /* A previously clear bit means this CPU had left the mm. */
                if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
                        /* We were in lazy tlb mode and leave_mm disabled
                         * tlb flush IPI delivery. We must reload %cr3.
                         */
                        load_cr3(next->pgd);
                        load_LDT_nolock(&next->context);
                }
        }
#endif
}

/*
 * deactivate_mm() - load 0 into the %gs segment register.
 *
 * Neither @tsk nor @mm is used by the expansion; both are accepted so the
 * call sites keep a uniform signature.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon, so "deactivate_mm(tsk, mm);" expands to exactly one statement
 * and stays safe inside an unbraced if/else.
 */
#define deactivate_mm(tsk, mm)                  \
        do {                                    \
                asm("movl %0,%%gs": :"r" (0));  \
        } while (0)

#endif



=========================================

/*
 *
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *      Stop ipi delivery for the old mm. This is not synchronized with
 *      the other cpus, but smp_invalidate_interrupt ignores flush ipis
 *      for the wrong mm, and in the worst case we perform a superfluous
 *      tlb flush.
 * 1a2) set cpu_tlbstate to TLBSTATE_OK
 *      Now the smp_invalidate_interrupt won't call leave_mm if cpu0
 *      was in lazy tlb mode.
 * 1a3) update cpu_tlbstate[].active_mm
 *      Now cpu0 accepts tlb flushes for the new mm.
 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
 *      Now the other cpus will send tlb flush ipis.
 * 1a5) change cr3.
 * 1b) thread switch without mm change
 *      cpu_tlbstate[].active_mm is correct, cpu0 already handles
 *      flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *      Atomically set the bit [other cpus will start sending flush ipis],
 *      and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush IPI:
 *
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 */


=========================================

int flush_old_exec(struct linux_binprm * bprm)
{
        char * name;
        int i, ch, retval;
        char tcomm[sizeof(current->comm)];

        /*
         * Make sure we have a private signal table and that
         * we are unassociated from the previous thread group.
         */
        retval = de_thread(current);
        if (retval)
                goto out;

        set_mm_exe_file(bprm->mm, bprm->file);

        /*
         * Release all of the old mmap stuff
         */
        retval = exec_mmap(bprm->mm);   /* ===> which calls activate_mm(), which loads CR3 */
        if (retval)
                goto out;

        bprm->mm = NULL;                /* We're using it now */

        /* This is the point of no return */

        current->sas_ss_sp = current->sas_ss_size = 0;

        if (current->euid == current->uid && current->egid == current->gid)
                set_dumpable(current->mm, 1);
        else
                set_dumpable(current->mm, suid_dumpable);

        name = bprm->filename;

        /* Copies the binary name from after last slash */
        for (i=0; (ch = *(name++)) != '\0';) {
                if (ch == '/')
                        i = 0; /* overwrite what we wrote */
                else
                        if (i < (sizeof(tcomm) - 1))
                                tcomm[i++] = ch;
        }
        tcomm[i] = '\0';
        set_task_comm(current, tcomm);

        current->flags &= ~PF_RANDOMIZE;
        flush_thread();

        /* Set the new mm task size. We have to do that late because it may
         * depend on TIF_32BIT which is only updated in flush_thread() on




=========================================

static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct * old_mm, *active_mm;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        mm_release(tsk, old_mm);

        if (old_mm) {
                /*
                 * Make sure that if there is a core dump in progress
                 * for the old mm, we get out and die instead of going
                 * through with the exec.  We must hold mmap_sem around
                 * checking core_state and changing tsk->mm.
                 */
                down_read(&old_mm->mmap_sem);
                if (unlikely(old_mm->core_state)) {
                        up_read(&old_mm->mmap_sem);
                        return -EINTR;
                }
        }
        task_lock(tsk);
        active_mm = tsk->active_mm;
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
        task_unlock(tsk);
        mm_update_next_owner(old_mm);
        arch_pick_mmap_layout(mm);
        if (old_mm) {

[linuxkernelnewbies] implementation of execve() --> how a process can entirely wipe away a previously executing process

Reply via email to