Hi Thomas,

I need to read it in detail - this is based on quickly scanning through it.

Based on my "experimental" notes, if you do not have a periodic timer 
set up in the part you have commented out and marked as FIXME, userspace 
hangs at some point (at least it used to in 2.6.31 to 3.4; I have not 
tried later versions).

If you have managed to fix it somewhere else, congrats. If 4.x does not 
do that any more - excellent too.

I will read it properly on the plane (travelling) tomorrow and reply 
sometime on Tue.

A.

On 03/05/15 16:46, Thomas Meyer wrote:
> Am Samstag, den 02.05.2015, 12:08 +0100 schrieb Anton Ivanov:
>> On 02/05/15 10:48, Thomas Meyer wrote:
>>> Hi,
>>>
>>> I did port Anton's v4 patch to v4.1-rc1-56-g3d99e3f and run it the last
>>> two days.
>>>
>>> Original v4 from Anton can be found here:
>>> https://sourceforge.net/p/user-mode-linux/mailman/message/32856805/
>>>    
>>> Issues addressed in v5 version:
>>> - Ported to v4.1-rc1-56-g3d99e3f
>>> - Replaced IRQF_DISABLED with IRQF_TIMER in request_irq(). I'm not sure
>>> if this is the right thing to do.
>>> - Removed unused variable/function: bbev and sleep_time()
>>>
>>> What I don't understand is:
>>> - why is SIGVTALRM/itimer still used? Wouldn't it be enough to only use
>>> the timer created by timer_create and SIGUSR2?
>> Pacing userspace. There are a couple of places where it is hardwired so
>> deeply into it that I was unable to remove it and replace it. For
>> example there is one place where it is set-up using a magic number
>> direct syscall incantation in the memory management subsystem and so on.
>>
>>> - why are both IRQs still registered in the uml kernel?
>>> request_irq() for TIMER_IRQ and HRTIMER_IRQ?
>> See above.
>>
>>> - don't duplicate signals occur now? One from SIGUSR2 and one from
>>> SIGVTALRM?
>> No
>>
>> VTALRM is still used for userspace pacing. All kernel stuff internally
>> will use USR2.
>>
>> This results in:
>>
>> Userspace applications still having a relatively imprecise and expensive
>> itimer based clock. All kernel stuff such as QoS, timeouts and timers in
>> any kernel drivers, tcp timers will use the new high res timer.
>>
>> I would love to kill the old timer completely as this will make the
>> userspace considerably more responsive, however some of the bits like
>> the magic incantations in the stub setups are beyond my understanding.
> Hi,
>
> I've been working on the patch below, based on your work, to completely kill
> the itimer. It still has some errors and things I don't understand yet, but
> hopefully I'm heading in the right direction!
>
> Comments are welcome!
>
> diff --git a/arch/um/Makefile b/arch/um/Makefile
> index 17d4460..a4a434f 100644
> --- a/arch/um/Makefile
> +++ b/arch/um/Makefile
> @@ -130,7 +130,7 @@ export LDS_ELF_FORMAT := $(ELF_FORMAT)
>   # The wrappers will select whether using "malloc" or the kernel allocator.
>   LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
>   
> -LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt))
> +LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) -lrt
>   
>   # Used by link-vmlinux.sh which has special support for um link
>   export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE)
> diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
> index 4a2037f..0f2a5b1 100644
> --- a/arch/um/include/asm/irq.h
> +++ b/arch/um/include/asm/irq.h
> @@ -16,8 +16,9 @@
>   #define TELNETD_IRQ                 12
>   #define XTERM_IRQ           13
>   #define RANDOM_IRQ          14
> +#define HRTIMER_IRQ          15
>   
> -#define LAST_IRQ RANDOM_IRQ
> +#define LAST_IRQ HRTIMER_IRQ
>   #define NR_IRQS (LAST_IRQ + 1)
>   
>   #endif
> diff --git a/arch/um/include/shared/as-layout.h 
> b/arch/um/include/shared/as-layout.h
> index ca1843e..798aa6e 100644
> --- a/arch/um/include/shared/as-layout.h
> +++ b/arch/um/include/shared/as-layout.h
> @@ -17,7 +17,7 @@
>   
>   /* Some constant macros are used in both assembler and
>    * C code.  Therefore we cannot annotate them always with
> - * 'UL' and other type specifiers unilaterally.  We
> + * 'UL' and other type specifiers unilaterally. We
>    * use the following macros to deal with this.
>    */
>   
> @@ -28,6 +28,13 @@
>   #define _UML_AC(X, Y)       __UML_AC(X, Y)
>   #endif
>   
> +/**
> + * userspace stub address space layout:
> + * Below macros define the layout of the stub code and data
> + * which are mapped in each userspace process:
> + *  - one page of code located at 0x100000 followed by
> + *  - one page of data
> + */
>   #define STUB_START _UML_AC(, 0x100000)
>   #define STUB_CODE _UML_AC((unsigned long), STUB_START)
>   #define STUB_DATA _UML_AC((unsigned long), STUB_CODE + UM_KERN_PAGE_SIZE)
> diff --git a/arch/um/include/shared/kern_util.h 
> b/arch/um/include/shared/kern_util.h
> index 83a91f9..0282b36 100644
> --- a/arch/um/include/shared/kern_util.h
> +++ b/arch/um/include/shared/kern_util.h
> @@ -37,6 +37,7 @@ extern void initial_thread_cb(void (*proc)(void *), void 
> *arg);
>   extern int is_syscall(unsigned long addr);
>   
>   extern void timer_handler(int sig, struct siginfo *unused_si, struct 
> uml_pt_regs *regs);
> +extern void hrtimer_handler(int sig, struct siginfo *unused_si, struct 
> uml_pt_regs *regs);
>   
>   extern int start_uml(void);
>   extern void paging_init(void);
> diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
> index d824528..4eb382f 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -217,7 +217,8 @@ extern int set_umid(char *name);
>   extern char *get_umid(void);
>   
>   /* signal.c */
> -extern void timer_init(void);
> +extern void uml_timer_set_signal_handler(void);
> +extern void uml_hrtimer_set_signal_handler(void);
>   extern void set_sigstack(void *sig_stack, int size);
>   extern void remove_sigstack(void);
>   extern void set_handler(int sig);
> @@ -238,12 +239,16 @@ extern void um_early_printk(const char *s, unsigned int 
> n);
>   extern void os_fix_helper_signals(void);
>   
>   /* time.c */
> -extern void idle_sleep(unsigned long long nsecs);
> -extern int set_interval(void);
> -extern int timer_one_shot(int ticks);
> -extern long long disable_timer(void);
> +extern void os_idle_sleep(unsigned long long nsecs);
> +extern int os_timer_create(void* timer);
> +extern int os_timer_set_interval(void* timer);
> +extern int os_timer_one_shot(int ticks);
> +extern long long os_timer_disable(void);
> +extern long os_timer_remain(void* timer);
>   extern void uml_idle_timer(void);
> +extern long long os_persistent_clock_emulation(void);
>   extern long long os_nsecs(void);
> +extern long long os_vnsecs(void);
>   
>   /* skas/mem.c */
>   extern long run_syscall_stub(struct mm_id * mm_idp,
> diff --git a/arch/um/include/shared/timer-internal.h 
> b/arch/um/include/shared/timer-internal.h
> new file mode 100644
> index 0000000..afdc6dc
> --- /dev/null
> +++ b/arch/um/include/shared/timer-internal.h
> @@ -0,0 +1,18 @@
> +/*
> + * Copyright (C) 2012 - 2014 Cisco Systems
> + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
> + * Licensed under the GPL
> + */
> +
> +#ifndef __TIMER_INTERNAL_H__
> +#define __TIMER_INTERNAL_H__
> +
> +#define TIMER_MULTIPLIER 256
> +#define TIMER_MIN_DELTA  500
> +
> +extern void timer_lock(void);
> +extern void timer_unlock(void);
> +
> +extern long long hrtimer_disable(void);
> +
> +#endif
> diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
> index 23cb935..4c1966a 100644
> --- a/arch/um/kernel/irq.c
> +++ b/arch/um/kernel/irq.c
> @@ -338,20 +338,20 @@ static struct irq_chip normal_irq_type = {
>       .irq_unmask = dummy,
>   };
>   
> -static struct irq_chip SIGVTALRM_irq_type = {
> -     .name = "SIGVTALRM",
> -     .irq_disable = dummy,
> -     .irq_enable = dummy,
> -     .irq_ack = dummy,
> -     .irq_mask = dummy,
> -     .irq_unmask = dummy,
> +static struct irq_chip SIGUSR2_irq_type = {
> +       .name = "SIGUSR2",
> +       .irq_disable = dummy,
> +       .irq_enable = dummy,
> +       .irq_ack = dummy,
> +       .irq_mask = dummy,
> +       .irq_unmask = dummy,
>   };
>   
>   void __init init_IRQ(void)
>   {
>       int i;
>   
> -     irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, 
> handle_edge_irq);
> +     irq_set_chip_and_handler(HRTIMER_IRQ, &SIGUSR2_irq_type, 
> handle_edge_irq);
>   
>       for (i = 1; i < NR_IRQS; i++)
>               irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
> diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
> index 68b9119..dab9c0b 100644
> --- a/arch/um/kernel/process.c
> +++ b/arch/um/kernel/process.c
> @@ -27,6 +27,7 @@
>   #include <kern_util.h>
>   #include <os.h>
>   #include <skas.h>
> +#include <timer-internal.h>
>   
>   /*
>    * This is a per-cpu array.  A processor only modifies its entry and it only
> @@ -204,8 +205,16 @@ void arch_cpu_idle(void)
>       unsigned long long nsecs;
>   
>       cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
> -     nsecs = disable_timer();
> -     idle_sleep(nsecs);
> +
> +     //WHAT?
> +     /* there is no benefit whatsoever in disabling a pending
> +      * hrtimer and setting a nanowait for the same value instead
> +      * so we do timer disable + wait only for the tracing one here
> +      */
> +
> +     nsecs = os_timer_disable();
> +     os_idle_sleep(nsecs);
> +     os_timer_set_interval(NULL);
>       local_irq_enable();
>   }
>   
> diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c
> index 289771d..c1cdc2e 100644
> --- a/arch/um/kernel/skas/clone.c
> +++ b/arch/um/kernel/skas/clone.c
> @@ -35,10 +35,11 @@ stub_clone_handler(void)
>       if (err)
>               goto out;
>   
> -     err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL,
> -                         (long) &data->timer, 0);
> -     if (err)
> -             goto out;
> +// WHY? FIXME: Switch to timer_create, timer_settime needed?!
> +//   err = stub_syscall3(__NR_setitimer, ITIMER_VIRTUAL,
> +//                       (long) &data->timer, 0);
> +//   if (err)
> +//           goto out;
>   
>       remap_stack(data->fd, data->offset);
>       goto done;
> diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
> index 117568d..a568205 100644
> --- a/arch/um/kernel/time.c
> +++ b/arch/um/kernel/time.c
> @@ -1,4 +1,5 @@
>   /*
> + * Copyright (C) 2012-2014 Cisco Systems
>    * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
>    * Licensed under the GPL
>    */
> @@ -8,32 +9,34 @@
>   #include <linux/interrupt.h>
>   #include <linux/jiffies.h>
>   #include <linux/threads.h>
> +#include <linux/spinlock.h>
>   #include <asm/irq.h>
>   #include <asm/param.h>
>   #include <kern_util.h>
>   #include <os.h>
> +#include <timer-internal.h>
>   
> -void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs 
> *regs)
> +void hrtimer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs 
> *regs)
>   {
>       unsigned long flags;
>   
>       local_irq_save(flags);
> -     do_IRQ(TIMER_IRQ, regs);
> +     do_IRQ(HRTIMER_IRQ, regs);
>       local_irq_restore(flags);
>   }
>   
> -static void itimer_set_mode(enum clock_event_mode mode,
> +static void timer_set_mode(enum clock_event_mode mode,
>                           struct clock_event_device *evt)
>   {
>       switch (mode) {
>       case CLOCK_EVT_MODE_PERIODIC:
> -             set_interval();
> +             os_timer_set_interval(NULL);
>               break;
>   
>       case CLOCK_EVT_MODE_SHUTDOWN:
>       case CLOCK_EVT_MODE_UNUSED:
>       case CLOCK_EVT_MODE_ONESHOT:
> -             disable_timer();
> +             os_timer_disable();
>               break;
>   
>       case CLOCK_EVT_MODE_RESUME:
> @@ -41,68 +44,74 @@ static void itimer_set_mode(enum clock_event_mode mode,
>       }
>   }
>   
> -static int itimer_next_event(unsigned long delta,
> +static int timer_next_event(unsigned long delta,
>                            struct clock_event_device *evt)
>   {
> -     return timer_one_shot(delta + 1);
> +     return os_timer_one_shot(delta);
>   }
>   
> -static struct clock_event_device itimer_clockevent = {
> -     .name           = "itimer",
> +static struct clock_event_device timer_clockevent = {
> +     .name           = "timer",
>       .rating         = 250,
>       .cpumask        = cpu_all_mask,
>       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
> -     .set_mode       = itimer_set_mode,
> -     .set_next_event = itimer_next_event,
> -     .shift          = 32,
> +     .set_mode       = timer_set_mode,
> +     .set_next_event = timer_next_event,
> +     .shift          = 0,
> +     .max_delta_ns   = 0xffffffff,
> +     .min_delta_ns   = TIMER_MIN_DELTA, //microsecond resolution should be 
> enough for anyone, same as 640K RAM
>       .irq            = 0,
> +     .mult           = 1,
>   };
>   
> -static irqreturn_t um_timer(int irq, void *dev)
> +static irqreturn_t um_timer_irq(int irq, void *dev)
>   {
> -     (*itimer_clockevent.event_handler)(&itimer_clockevent);
> +     (*timer_clockevent.event_handler)(&timer_clockevent);
>   
>       return IRQ_HANDLED;
>   }
>   
> -static cycle_t itimer_read(struct clocksource *cs)
> +static cycle_t timer_read(struct clocksource *cs)
>   {
> -     return os_nsecs() / 1000;
> +     return os_nsecs() / TIMER_MULTIPLIER;
>   }
>   
> -static struct clocksource itimer_clocksource = {
> -     .name           = "itimer",
> +static struct clocksource timer_clocksource = {
> +     .name           = "timer",
>       .rating         = 300,
> -     .read           = itimer_read,
> +     .read           = timer_read,
>       .mask           = CLOCKSOURCE_MASK(64),
>       .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
>   };
>   
> -static void __init setup_itimer(void)
> +static void __init timer_setup(void)
>   {
>       int err;
>   
> -     err = request_irq(TIMER_IRQ, um_timer, 0, "timer", NULL);
> -     if (err != 0)
> +     err = request_irq(HRTIMER_IRQ, um_timer_irq, IRQF_TIMER, "hr timer", 
> NULL);
> +     if (err != 0) {
>               printk(KERN_ERR "register_timer : request_irq failed - "
>                      "errno = %d\n", -err);
> +             return;
> +    }
> +
> +    err = os_timer_create(NULL);
> +    if (err != 0) {
> +        printk(KERN_ERR "creation of timer failed - errno = %d\n", -err);
> +        return;
> +    }
>   
> -     itimer_clockevent.mult = div_sc(HZ, NSEC_PER_SEC, 32);
> -     itimer_clockevent.max_delta_ns =
> -             clockevent_delta2ns(60 * HZ, &itimer_clockevent);
> -     itimer_clockevent.min_delta_ns =
> -             clockevent_delta2ns(1, &itimer_clockevent);
> -     err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC);
> +     err = clocksource_register_hz(&timer_clocksource, 
> NSEC_PER_SEC/TIMER_MULTIPLIER);
>       if (err) {
>               printk(KERN_ERR "clocksource_register_hz returned %d\n", err);
>               return;
>       }
> -     clockevents_register_device(&itimer_clockevent);
> +     clockevents_register_device(&timer_clockevent);
>   }
>   
>   void read_persistent_clock(struct timespec *ts)
>   {
> -     long long nsecs = os_nsecs();
> +     long long nsecs = os_persistent_clock_emulation();
>   
>       set_normalized_timespec(ts, nsecs / NSEC_PER_SEC,
>                               nsecs % NSEC_PER_SEC);
> @@ -110,6 +119,6 @@ void read_persistent_clock(struct timespec *ts)
>   
>   void __init time_init(void)
>   {
> -     timer_init();
> -     late_time_init = setup_itimer;
> +     uml_hrtimer_set_signal_handler();
> +     late_time_init = timer_setup;
>   }
> diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h
> deleted file mode 100644
> index 0dc2c9f..0000000
> --- a/arch/um/os-Linux/internal.h
> +++ /dev/null
> @@ -1 +0,0 @@
> -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc);
> diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
> index df9191a..bd5907e 100644
> --- a/arch/um/os-Linux/main.c
> +++ b/arch/um/os-Linux/main.c
> @@ -168,8 +168,8 @@ int __init main(int argc, char **argv, char **envp)
>        * some time) and cause a segfault.
>        */
>   
> -     /* stop timers and set SIGVTALRM to be ignored */
> -     disable_timer();
> +     /* stop timers and set timer signal to be ignored */
> +     os_timer_disable();
>   
>       /* disable SIGIO for the fds and set SIGIO to be ignored */
>       err = deactivate_all_fds();
> diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
> index 7b605e4..ee6db2e 100644
> --- a/arch/um/os-Linux/signal.c
> +++ b/arch/um/os-Linux/signal.c
> @@ -13,7 +13,6 @@
>   #include <kern_util.h>
>   #include <os.h>
>   #include <sysdep/mcontext.h>
> -#include "internal.h"
>   
>   void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
>       [SIGTRAP]       = relay_signal,
> @@ -23,7 +22,8 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct 
> uml_pt_regs *) = {
>       [SIGBUS]        = bus_handler,
>       [SIGSEGV]       = segv_handler,
>       [SIGIO]         = sigio_handler,
> -     [SIGVTALRM]     = timer_handler };
> +     [SIGUSR2]       = hrtimer_handler
> +};
>   
>   static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
>   {
> @@ -38,7 +38,7 @@ static void sig_handler_common(int sig, struct siginfo *si, 
> mcontext_t *mc)
>       }
>   
>       /* enable signals if sig isn't IRQ signal */
> -     if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
> +     if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM) && (sig 
> != SIGUSR2))
>               unblock_signals();
>   
>       (*sig_info[sig])(sig, si, &r);
> @@ -55,8 +55,8 @@ static void sig_handler_common(int sig, struct siginfo *si, 
> mcontext_t *mc)
>   #define SIGIO_BIT 0
>   #define SIGIO_MASK (1 << SIGIO_BIT)
>   
> -#define SIGVTALRM_BIT 1
> -#define SIGVTALRM_MASK (1 << SIGVTALRM_BIT)
> +#define SIGUSR2_BIT 2
> +#define SIGUSR2_MASK (1 << SIGUSR2_BIT)
>   
>   static int signals_enabled;
>   static unsigned int signals_pending;
> @@ -78,46 +78,47 @@ void sig_handler(int sig, struct siginfo *si, mcontext_t 
> *mc)
>       set_signals(enabled);
>   }
>   
> -static void real_alarm_handler(mcontext_t *mc)
> +static void real_hralarm_handler(mcontext_t *mc)
>   {
>       struct uml_pt_regs regs;
>   
>       if (mc != NULL)
>               get_regs_from_mc(&regs, mc);
>       regs.is_user = 0;
> -     unblock_signals();
> -     timer_handler(SIGVTALRM, NULL, &regs);
> +     hrtimer_handler(SIGUSR2, NULL, &regs);
>   }
>   
> -void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
> +void hralarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
>   {
>       int enabled;
>   
>       enabled = signals_enabled;
>       if (!signals_enabled) {
> -             signals_pending |= SIGVTALRM_MASK;
> +             signals_pending |= SIGUSR2_MASK;
>               return;
>       }
>   
>       block_signals();
> -
> -     real_alarm_handler(mc);
> +     real_hralarm_handler(mc);
>       set_signals(enabled);
>   }
>   
> -void timer_init(void)
> +void uml_hrtimer_set_signal_handler(void)
>   {
> -     set_handler(SIGVTALRM);
> +     set_handler(SIGUSR2);
>   }
>   
>   void set_sigstack(void *sig_stack, int size)
>   {
> -     stack_t stack = ((stack_t) { .ss_flags  = 0,
> -                                  .ss_sp     = (__ptr_t) sig_stack,
> -                                  .ss_size   = size - sizeof(void *) });
> +     stack_t stack = ((stack_t) {
> +                 .ss_flags = 0,
> +                             .ss_sp    = (__ptr_t) sig_stack,
> +                             .ss_size  = size - sizeof(void *)
> +     });
>   
> -     if (sigaltstack(&stack, NULL) != 0)
> +     if (sigaltstack(&stack, NULL) != 0) {
>               panic("enabling signal stack failed, errno = %d\n", errno);
> +     }
>   }
>   
>   static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) 
> = {
> @@ -129,10 +130,9 @@ static void (*handlers[_NSIG])(int sig, struct siginfo 
> *si, mcontext_t *mc) = {
>   
>       [SIGIO] = sig_handler,
>       [SIGWINCH] = sig_handler,
> -     [SIGVTALRM] = alarm_handler
> +     [SIGUSR2] = hralarm_handler
>   };
>   
> -
>   static void hard_handler(int sig, siginfo_t *si, void *p)
>   {
>       struct ucontext *uc = p;
> @@ -176,6 +176,13 @@ static void hard_handler(int sig, siginfo_t *si, void *p)
>       } while (pending);
>   }
>   
> +/**
> + * set_handler() - enable signal in process' signal mask
> + * @sig:    The signal to enable
> + *
> + * Enable the given signal in the process' signal mask and
> + * attach hard_handler() as handler routine
> + */
>   void set_handler(int sig)
>   {
>       struct sigaction action;
> @@ -186,9 +193,9 @@ void set_handler(int sig)
>   
>       /* block irq ones */
>       sigemptyset(&action.sa_mask);
> -     sigaddset(&action.sa_mask, SIGVTALRM);
>       sigaddset(&action.sa_mask, SIGIO);
>       sigaddset(&action.sa_mask, SIGWINCH);
> +     sigaddset(&action.sa_mask, SIGUSR2);
>   
>       if (sig == SIGSEGV)
>               flags |= SA_NODEFER;
> @@ -281,8 +288,8 @@ void unblock_signals(void)
>               if (save_pending & SIGIO_MASK)
>                       sig_handler_common(SIGIO, NULL, NULL);
>   
> -             if (save_pending & SIGVTALRM_MASK)
> -                     real_alarm_handler(NULL);
> +             if (save_pending & SIGUSR2_MASK)
> +                     real_hralarm_handler(NULL);
>       }
>   }
>   
> @@ -298,9 +305,11 @@ int set_signals(int enable)
>               return enable;
>   
>       ret = signals_enabled;
> -     if (enable)
> +     if (enable) {
>               unblock_signals();
> -     else block_signals();
> +     } else {
> +         block_signals();
> +    }
>   
>       return ret;
>   }
> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
> index 7a97775..555108a 100644
> --- a/arch/um/os-Linux/skas/process.c
> +++ b/arch/um/os-Linux/skas/process.c
> @@ -45,7 +45,7 @@ static int ptrace_dump_regs(int pid)
>    * Signals that are OK to receive in the stub - we'll just continue it.
>    * SIGWINCH will happen when UML is inside a detached screen.
>    */
> -#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH))
> +#define STUB_SIG_MASK ((1 << SIGVTALRM) | (1 << SIGWINCH) | (1 << SIGUSR2))
>   
>   /* Signals that the stub will finish with - anything else is an error */
>   #define STUB_DONE_MASK (1 << SIGTRAP)
> @@ -176,17 +176,57 @@ static void handle_trap(int pid, struct uml_pt_regs 
> *regs,
>   
>   extern int __syscall_stub_start;
>   
> +/**
> + * userspace_tramp() - userspace trampoline
> + * @stack:  The address of the stack used for the new process (used for
> + *          SIGSEGV handling.
> + *
> + * The trampoline does execute as a new process after clone()
> + * For each new userspace process the below code sets up
> + * all necessary data:
> + * 1.) enable ptrace from parent (the uml kernel)
> + * 2.) Setup signal handling. Signals are inherited by the parent, i.e
> + *     the uml kernel
> + * 3.) Create and start an posix (interval) timer for this process.
> + *     This timer will emulate the kernel timer ticks.
> + *     The timer signal will be processed by the kernel process in 
> userspace()
> + * 4.) Map stub code page in the new process, i.e. the
> + *     userspace process:
> + *     The stub codes is used to catch syscalls from the userspace to
> + *     the kernel.
> + *     See linker scripts arch/um/kernel/dyn.lds.S (dynamic) resp.
> + *                        arch/um/kernel/uml.lds.S (static)
> + *     for __syscall_stub_start defintion and
> + *     arch/um/kernel/skas/clone.c for the stub_handler itself.
> + * 5.) Map stub data page in the new process, i.e. the
> + *     userspace process:
> + *     Setup an SIGSEGV handler into the new process.
> + *     Page faults will be catched and signaled to the kernel via this
> + *     mechanism.
> + *     See arch/x86/um/stub_segv.c for the handler itself.
> + * 6.) Stop the new process and wait for the kernel to SIGCONT it agian
> + *     when it will get scheduled()
> + */
>   static int userspace_tramp(void *stack)
>   {
>       void *addr;
>       int err, fd;
>       unsigned long long offset;
> +     timer_t timer;
>   
>       ptrace(PTRACE_TRACEME, 0, 0, 0);
>   
>       signal(SIGTERM, SIG_DFL);
>       signal(SIGWINCH, SIG_IGN);
> -     err = set_interval();
> +
> +     err = os_timer_create(&timer);
> +     if (err) {
> +             printk(UM_KERN_ERR "userspace_tramp - creation of timer failed, 
> "
> +                    "errno = %d\n", err);
> +             exit(1);
> +     }
> +
> +     err = os_timer_set_interval(&timer);
>       if (err) {
>               printk(UM_KERN_ERR "userspace_tramp - setting timer failed, "
>                      "errno = %d\n", err);
> @@ -313,10 +353,16 @@ int start_userspace(unsigned long stub_stack)
>       return err;
>   }
>   
> +/**
> + * userspace() - user space control loop
> + * @regs: ?
> + *
> + * The main loop that traces and controls each spwaned userspace
> + * process, i.e.
> + */
>   void userspace(struct uml_pt_regs *regs)
>   {
> -     struct itimerval timer;
> -     unsigned long long nsecs, now;
> +     unsigned long long nsecs;
>       int err, status, op, pid = userspace_pid[0];
>       /* To prevent races if using_sysemu changes under us.*/
>       int local_using_sysemu;
> @@ -325,13 +371,11 @@ void userspace(struct uml_pt_regs *regs)
>       /* Handle any immediate reschedules or signals */
>       interrupt_end();
>   
> -     if (getitimer(ITIMER_VIRTUAL, &timer))
> -             printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
> -     nsecs = timer.it_value.tv_sec * UM_NSEC_PER_SEC +
> -             timer.it_value.tv_usec * UM_NSEC_PER_USEC;
> -     nsecs += os_nsecs();
> -
>       while (1) {
> +
> +             nsecs = os_timer_remain(NULL);
> +         nsecs += os_nsecs();
> +
>               /*
>                * This can legitimately fail if the process loads a
>                * bogus value into a segment register.  It will
> @@ -388,31 +432,27 @@ void userspace(struct uml_pt_regs *regs)
>                       switch (sig) {
>                       case SIGSEGV:
>                               if (PTRACE_FULL_FAULTINFO) {
> -                                     get_skas_faultinfo(pid,
> -                                                        &regs->faultinfo);
> -                                     (*sig_info[SIGSEGV])(SIGSEGV, (struct 
> siginfo *)&si,
> -                                                          regs);
> +                                     
> get_skas_faultinfo(pid,&regs->faultinfo);
> +                                     (*sig_info[SIGSEGV])(SIGSEGV, (struct 
> siginfo *)&si, regs);
> +                             } else {
> +                                     handle_segv(pid, regs);
>                               }
> -                             else handle_segv(pid, regs);
>                               break;
>                       case SIGTRAP + 0x80:
> -                             handle_trap(pid, regs, local_using_sysemu);
> +                             handle_trap(pid, regs, local_using_sysemu);
>                               break;
>                       case SIGTRAP:
>                               relay_signal(SIGTRAP, (struct siginfo *)&si, 
> regs);
>                               break;
> -                     case SIGVTALRM:
> -                             now = os_nsecs();
> -                             if (now < nsecs)
> +                     case SIGUSR2:
> +                             /* only process the timer tick from userspace, 
> if the kernel
> +                              * timer is not finished yet */
> +                             if (nsecs < os_nsecs()) {
>                                       break;
> +                             }
>                               block_signals();
>                               (*sig_info[sig])(sig, (struct siginfo *)&si, 
> regs);
>                               unblock_signals();
> -                             nsecs = timer.it_value.tv_sec *
> -                                     UM_NSEC_PER_SEC +
> -                                     timer.it_value.tv_usec *
> -                                     UM_NSEC_PER_USEC;
> -                             nsecs += os_nsecs();
>                               break;
>                       case SIGIO:
>                       case SIGILL:
> @@ -448,8 +488,7 @@ static int __init init_thread_regs(void)
>       thread_regs[REGS_IP_INDEX] = STUB_CODE +
>                               (unsigned long) stub_clone_handler -
>                               (unsigned long) &__syscall_stub_start;
> -     thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE -
> -             sizeof(void *);
> +     thread_regs[REGS_SP_INDEX] = STUB_DATA + UM_KERN_PAGE_SIZE - 
> sizeof(void *);
>   #ifdef __SIGNAL_FRAMESIZE
>       thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
>   #endif
> @@ -460,23 +499,26 @@ __initcall(init_thread_regs);
>   
>   int copy_context_skas0(unsigned long new_stack, int pid)
>   {
> -     struct timeval tv = { .tv_sec = 0, .tv_usec = UM_USEC_PER_SEC / UM_HZ };
> +     struct timeval tv = { .tv_sec = 0, .tv_usec = UM_NSEC_PER_SEC / UM_HZ };
>       int err;
>       unsigned long current_stack = current_stub_stack();
>       struct stub_data *data = (struct stub_data *) current_stack;
>       struct stub_data *child_data = (struct stub_data *) new_stack;
>       unsigned long long new_offset;
> +
>       int new_fd = phys_mapping(to_phys((void *)new_stack), &new_offset);
>   
>       /*
>        * prepare offset and fd of child's stack as argument for parent's
>        * and child's mmap2 calls
>        */
> -     *data = ((struct stub_data) { .offset   = MMAP_OFFSET(new_offset),
> -                                   .fd       = new_fd,
> -                                   .timer    = ((struct itimerval)
> -                                                { .it_value = tv,
> -                                                  .it_interval = tv }) });
> +     *data = ((struct stub_data) {
> +                     .offset = MMAP_OFFSET(new_offset),
> +                     .fd     = new_fd,
> +                     .timer  = ((struct itimerval)
> +                                          { .it_value    = tv,
> +                                            .it_interval = tv })
> +     });
>   
>       err = ptrace_setregs(pid, thread_regs);
>       if (err < 0) {
> diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
> index e9824d5..d66d3b6 100644
> --- a/arch/um/os-Linux/time.c
> +++ b/arch/um/os-Linux/time.c
> @@ -1,4 +1,5 @@
>   /*
> + * Copyright (C) 2012-2014 Cisco Systems
>    * Copyright (C) 2000 - 2007 Jeff Dike (jdike{addtoit,linux.intel}.com)
>    * Licensed under the GPL
>    */
> @@ -10,73 +11,155 @@
>   #include <sys/time.h>
>   #include <kern_util.h>
>   #include <os.h>
> -#include "internal.h"
> +#include <string.h>
> +#include <timer-internal.h>
>   
> -int set_interval(void)
> +static timer_t event_high_res_timer = 0;
> +
> +static inline long long timeval_to_ns(const struct timeval *tv)
>   {
> -     int usec = UM_USEC_PER_SEC / UM_HZ;
> -     struct itimerval interval = ((struct itimerval) { { 0, usec },
> -                                                       { 0, usec } });
> +     return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) +
> +             tv->tv_usec * UM_NSEC_PER_USEC;
> +}
>   
> -     if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
> -             return -errno;
> +static inline long long timespec_to_ns(const struct timespec *ts)
> +{
> +     return ((long long) ts->tv_sec * UM_NSEC_PER_SEC) +
> +             ts->tv_nsec;
> +}
> +
> +long long os_persistent_clock_emulation (void) {
> +     struct timespec realtime_tp;
> +
> +     clock_gettime(CLOCK_REALTIME, &realtime_tp);
> +     return timespec_to_ns(&realtime_tp);
> +}
> +
> +/**
> + * os_timer_create() - create a new POSIX (interval) timer
> + */
> +int os_timer_create(void* timer) {
> +
> +     struct sigevent sev;
> +     timer_t* t = timer;
>   
> +     if(t == NULL) {
> +             t = &event_high_res_timer;
> +     }
> +
> +     sev.sigev_notify = SIGEV_SIGNAL;
> +     sev.sigev_signo = SIGUSR2; /* note - hrtimer now has its own signal */
> +     sev.sigev_value.sival_ptr = &event_high_res_timer;
> +
> +     if (timer_create(
> +             CLOCK_MONOTONIC,
> +             &sev,
> +             t) == -1) {
> +//        printk("Failed to create Timer");
> +             return -1;
> +     }
> +//   printk("Event timer ID is 0x%lx\n", (long) *t);
>       return 0;
>   }
>   
> -int timer_one_shot(int ticks)
> +int os_timer_set_interval(void* timer)
>   {
> -     unsigned long usec = ticks * UM_USEC_PER_SEC / UM_HZ;
> -     unsigned long sec = usec / UM_USEC_PER_SEC;
> -     struct itimerval interval;
> +     struct itimerspec its;
> +     unsigned long long nsec;
> +     timer_t* t = timer;
> +
> +     if(t == NULL) {
> +             t = &event_high_res_timer;
> +     }
>   
> -     usec %= UM_USEC_PER_SEC;
> -     interval = ((struct itimerval) { { 0, 0 }, { sec, usec } });
> +     nsec = UM_NSEC_PER_SEC / UM_HZ;
>   
> -     if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
> +     its.it_value.tv_sec = 0;
> +     its.it_value.tv_nsec = nsec;
> +
> +     its.it_interval.tv_sec = 0;
> +     its.it_interval.tv_nsec = nsec;
> +
> +     if(timer_settime(*t, 0, &its, NULL) == -1) {
>               return -errno;
> +     }
>   
>       return 0;
>   }
>   
>   /**
> - * timeval_to_ns - Convert timeval to nanoseconds
> - * @ts:              pointer to the timeval variable to be converted
> - *
> - * Returns the scalar nanosecond representation of the timeval
> - * parameter.
> - *
> - * Ripped from linux/time.h because it's a kernel header, and thus
> - * unusable from here.
> + * os_timer_remain() - returns the remaining nanoseconds of the given interval
> + *                     timer
> + * Because this is the remaining time of an interval timer, which corresponds
> + * to HZ, this value can never be bigger than one second. Just
> + * the nanosecond part of the timer is returned.
> + * The returned time is relative to the start time of the interval timer.
> + * Returns a negative value in case of an error.
>    */
> -static inline long long timeval_to_ns(const struct timeval *tv)
> +long os_timer_remain(void* timer)
>   {
> -     return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) +
> -             tv->tv_usec * UM_NSEC_PER_USEC;
> +     struct itimerspec its;
> +     timer_t* t = timer;
> +
> +     if(t == NULL) {
> +             t = &event_high_res_timer;
> +     }
> +
> +     if(timer_gettime(t, &its) == -1) {
> +             return -errno;
> +     }
> +
> +     return its.it_value.tv_nsec;
> +}
> +
> +int os_timer_one_shot(int ticks)
> +{
> +     struct itimerspec its;
> +     unsigned long long nsec;
> +     unsigned long sec;
> +
> +    nsec = (ticks + 1);
> +    sec = nsec / UM_NSEC_PER_SEC;
> +     nsec = nsec % UM_NSEC_PER_SEC;
> +
> +     its.it_value.tv_sec = nsec / UM_NSEC_PER_SEC;
> +     its.it_value.tv_nsec = nsec;
> +
> +     its.it_interval.tv_sec = 0;
> +     its.it_interval.tv_nsec = 0; // we cheat here
> +
> +     timer_settime(event_high_res_timer, 0, &its, NULL);
> +     return 0;
>   }
>   
> -long long disable_timer(void)
> +/**
> + * os_timer_disable() - disable the posix (interval) timer
> + * Returns the remaining interval timer time in nanoseconds
> + */
> +long long os_timer_disable(void)
>   {
> -     struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } });
> -     long long remain, max = UM_NSEC_PER_SEC / UM_HZ;
> +     struct itimerspec its;
>   
> -     if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0)
> -             printk(UM_KERN_ERR "disable_timer - setitimer failed, "
> -                    "errno = %d\n", errno);
> +     memset(&its, 0, sizeof(struct itimerspec));
> +     timer_settime(event_high_res_timer, 0, &its, &its);
> +
> +     return its.it_value.tv_sec * UM_NSEC_PER_SEC + its.it_value.tv_nsec;
> +}
>   
> -     remain = timeval_to_ns(&time.it_value);
> -     if (remain > max)
> -             remain = max;
> +long long os_vnsecs(void)
> +{
> +     struct timespec ts;
>   
> -     return remain;
> +     clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&ts);
> +     return timespec_to_ns(&ts);
>   }
>   
>   long long os_nsecs(void)
>   {
> -     struct timeval tv;
> +     struct timespec ts;
>   
> -     gettimeofday(&tv, NULL);
> -     return timeval_to_ns(&tv);
> +     clock_gettime(CLOCK_MONOTONIC,&ts);
> +     return timespec_to_ns(&ts);
>   }
>   
>   #ifdef UML_CONFIG_NO_HZ_COMMON
> @@ -87,12 +170,7 @@ static int after_sleep_interval(struct timespec *ts)
>   
>   static void deliver_alarm(void)
>   {
> -     alarm_handler(SIGVTALRM, NULL, NULL);
> -}
> -
> -static unsigned long long sleep_time(unsigned long long nsecs)
> -{
> -     return nsecs;
> +//   alarm_handler(SIGVTALRM, NULL, NULL);
>   }
>   
>   #else
> @@ -102,14 +180,17 @@ unsigned long long skew;
>   static void deliver_alarm(void)
>   {
>       unsigned long long this_tick = os_nsecs();
> +     //FIXME: int okay?
>       int one_tick = UM_NSEC_PER_SEC / UM_HZ;
>   
>       /* Protection against the host's time going backwards */
> -     if ((last_tick != 0) && (this_tick < last_tick))
> +     if ((last_tick != 0) && (this_tick < last_tick)) {
>               this_tick = last_tick;
> +     }
>   
> -     if (last_tick == 0)
> +     if (last_tick == 0) {
>               last_tick = this_tick - one_tick;
> +     }
>   
>       skew += this_tick - last_tick;
>   
> @@ -132,7 +213,7 @@ static inline long long timespec_to_us(const struct timespec *ts)
>               ts->tv_nsec / UM_NSEC_PER_USEC;
>   }
>   
> -static int after_sleep_interval(struct timespec *ts)
> +static int timer_after_sleep_interval(struct timespec *ts)
>   {
>       int usec = UM_USEC_PER_SEC / UM_HZ;
>       long long start_usecs = timespec_to_us(ts);
> @@ -146,15 +227,16 @@ static int after_sleep_interval(struct timespec *ts)
>        * tick interval.  If this happens, then just reduce the first
>        * tick to the interval value.
>        */
> -     if (start_usecs > usec)
> +     if (start_usecs > usec) {
>               start_usecs = usec;
> +     }
>   
>       start_usecs -= skew / UM_NSEC_PER_USEC;
>       if (start_usecs < 0)
>               start_usecs = 0;
>   
>       tv = ((struct timeval) { .tv_sec  = start_usecs / UM_USEC_PER_SEC,
> -                              .tv_usec = start_usecs % UM_USEC_PER_SEC });
> +                             .tv_usec = start_usecs % UM_USEC_PER_SEC });
>       interval = ((struct itimerval) { { 0, usec }, tv });
>   
>       if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
> @@ -164,23 +246,33 @@ static int after_sleep_interval(struct timespec *ts)
>   }
>   #endif
>   
> -void idle_sleep(unsigned long long nsecs)
> +/**
> + * os_idle_sleep() - sleep for a given time of nsecs
> + * @nsecs: nanoseconds to sleep
> + */
> +void os_idle_sleep(unsigned long long nsecs)
>   {
>       struct timespec ts;
>   
>       /*
> -      * nsecs can come in as zero, in which case, this starts a
> -      * busy loop.  To prevent this, reset nsecs to the tick
> -      * interval if it is zero.
> +      *   We sleep here for an interval that is not greater than one
> +      *   tick (1/HZ). We did not disable the timer in "disable", so if
> +      *   a timer is active it will wake us up right on time instead of
> +      *   us doing stupid things trying to program nanosleep in a racy
> +      *   manner.
>        */
> -     if (nsecs == 0)
> -             nsecs = UM_NSEC_PER_SEC / UM_HZ;
>   
> -     nsecs = sleep_time(nsecs);
> -     ts = ((struct timespec) { .tv_sec       = nsecs / UM_NSEC_PER_SEC,
> -                               .tv_nsec      = nsecs % UM_NSEC_PER_SEC });
> +      if ((nsecs == 0) || (nsecs > UM_NSEC_PER_SEC / UM_HZ)) {
> +     nsecs = UM_NSEC_PER_SEC / UM_HZ ;
> +    }
>   
> -     if (nanosleep(&ts, &ts) == 0)
> +     ts = ((struct timespec) {
> +                     .tv_sec  = 0,
> +                     .tv_nsec = nsecs
> +     });
> +
> +     if (clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, &ts) == 0) {
>               deliver_alarm();
> -     after_sleep_interval(&ts);
> +     }
> +//   after_sleep_interval(&ts);
>   }
>
>
>


------------------------------------------------------------------------------
One dashboard for servers and applications across Physical-Virtual-Cloud 
Widest out-of-the-box monitoring support with 50+ applications
Performance metrics, stats and reports that give you Actionable Insights
Deep dive visibility with transaction tracing using APM Insight.
http://ad.doubleclick.net/ddm/clk/290420510;117567292;y
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel

Reply via email to