[PATCH] kernel/time: Feedback reply for hr_sleep syscall, a fine-grained sleep service

2021-04-07 Thread Marco Faltelli
Current sleep services (nanosleep) provide sleep periods very far from the 
expectations when scheduling microsecond-scale timers. On our testbed, using 
rdtscp() before and after a nanosleep() syscall to measure the effective 
elapsed time with a 1us timer, we got ~59us.
Even with larger timeout periods, the difference is still evident (e.g., with a 
100us timer, we measured ~158us of elapsed time).
We believe that one of the reasons is the use of the timespec structure, which 
needs to be copied from user to kernel space and then converted into a 
single-value representation.
In our work Metronome (https://dl.acm.org/doi/pdf/10.1145/3386367.3432730) we 
had the need for a precise microsecond-granularity sleep service, as 
nanosleep() was far from our needs, so we developed hr_sleep(), a new sleep 
service. Since the sleep periods needed in our case are small, we don't want 
our sleep service to re-schedule a timer in case of a signal interruption, so 
it just returns -EINTR to the user. The user must be aware that this is a 
best-effort sleep service, so the sleep period specified is an upper-bound of 
the effective elapsed time.
We believe this patch can be useful in applications where fine-grained 
granularity is requested for small sleep periods, and re-scheduling the timer 
in case of a signal is not mandatory.
In the paper previously linked, Section 3.1 provides more details about 
hr_sleep and Section 3.3 extensively evaluates hr_sleep() and compares it to 
nanosleep(). For a 1us timeout, hr_sleep() elapses ~3.8us in mean vs. the ~59us 
of nanosleep().
hr_sleep has been previously submitted at 
https://lore.kernel.org/lkml/20210115180733.5663-1-marco.falte...@uniroma2.it/.
This commit addresses the previous feedback in 
https://lore.kernel.org/lkml/CALCETrWfnL=3m3nmmhs-a3si5jptsctf6cethvtsdnwa5mh...@mail.gmail.com/
 and applies the requested changes.

Signed-off-by: Marco Faltelli 
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 kernel/time/hrtimer.c  | 67 ++
 2 files changed, 68 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 7bf01cbe582f..85b14dfa40fb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,6 +364,7 @@
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
 442	common	mount_setattr		sys_mount_setattr
+443	common	hr_sleep		sys_hr_sleep
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4a66725b1d4a..887c01392e08 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2006,6 +2006,73 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 
__user *, rqtp,
 }
 #endif
 
+
+
+#ifdef CONFIG_64BIT
+
+
+struct control_record { /* per-call state shared by hr_sleep and its timer callback */
+   struct task_struct *task; /* task that hr_sleep_callback() wakes */
+   int awake; /* set to 1 by hr_sleep_callback() when the timer fires */
+   struct hrtimer hr_timer; /* embedded timer; container_of() maps it back to this record */
+};
+
+
+/* Timer expiry handler: flags the sleeper's record and wakes its task. */
+static enum hrtimer_restart hr_sleep_callback(struct hrtimer *timer)
+{
+	struct control_record *control;
+
+	/* Recover the record that embeds this timer. */
+	control = container_of(timer, struct control_record, hr_timer);
+	control->awake = 1;
+	wake_up_process(control->task);
+
+	return HRTIMER_NORESTART;
+}
+
+
+
+/**
+ * hr_sleep - a high-resolution sleep service for fine-grained timeouts
+ * @nanoseconds:   the requested sleep period in nanoseconds
+ *
+ * Best effort: a signal ends the sleep early and the timer is not re-armed.
+ *
+ * Returns:
+ * 0 when the sleep request successfully terminated
+ * -EINVAL if a sleep period < 0 is requested
+ * -EINTR if a signal interrupted the calling thread
+ */
+SYSCALL_DEFINE1(hr_sleep, long, nanoseconds)
+{
+	DECLARE_WAIT_QUEUE_HEAD(the_queue);
+	struct control_record control;
+
+	if (nanoseconds < 0)
+		return -EINVAL;
+
+	if (nanoseconds == 0)
+		return 0;
+
+	/* The timer lives in this stack frame, so use the on-stack variants. */
+	hrtimer_init_on_stack(&control.hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	control.hr_timer.function = hr_sleep_callback;
+	control.task = current;
+	control.awake = 0;
+	hrtimer_start(&control.hr_timer, ktime_set(0, nanoseconds), HRTIMER_MODE_REL);
+	/* Nothing wakes the_queue; the callback wakes this task directly. */
+	wait_event_interruptible(the_queue, control.awake == 1);
+	hrtimer_cancel(&control.hr_timer);
+	destroy_hrtimer_on_stack(&control.hr_timer);
+
+	/* awake still clear means a signal, not the timer, ended the wait. */
+	return control.awake ? 0 : -EINTR;
+}
+
+#endif
+
+
 /*
  * Functions related to boot-time initialization:
  */
-- 
2.25.1



[PATCH] kernel/time: Add hr_sleep syscall, a high-resolution sleep service

2021-01-15 Thread Marco Faltelli
hr_sleep is a new system call engineered for nanosecond time scale
granularities.
With respect to nanosleep, it uses a single value representation
of the sleep period.
hr_sleep achieves 15x improvement for microsecond scale timers
w.r.t. nanosleep: the reason is the use of a CPU register for
passing the sleep period (avoiding cross-ring data move) and
the use of the thread's kernel stack area (avoiding in-kernel
memory allocations).
Further details about hr_sleep and the evaluation compared
to nanosleep can be found in Section 3 of our paper "Metronome:
adaptive and precise intermittent packet retrieval in DPDK"
hr_sleep in this patch has syscall number 442, so you can try it
by calling syscall(442, sleep_period).

Signed-off-by: Marco Faltelli 
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 kernel/time/hrtimer.c  | 61 ++
 2 files changed, 62 insertions(+)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..27343c016e42 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
 441	common	epoll_pwait2		sys_epoll_pwait2
+442	common	hr_sleep		sys_hr_sleep
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..422410c60a9f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1988,6 +1988,67 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 
__user *, rqtp,
 }
 #endif
 
+#ifdef CONFIG_64BIT
+
+
+typedef struct _control_record { /* per-call state shared by hr_sleep and its timer callback */
+   struct task_struct *task; /* task that hr_sleep_callback() wakes */
+   int pid; /* copy of task->pid, filled in by hr_sleep */
+   int awake; /* set to 1 by hr_sleep_callback() when the timer fires */
+   struct hrtimer hr_timer; /* embedded timer; container_of() maps it back to this record */
+} control_record;
+
+
+/*
+ * hrtimer expiry handler for hr_sleep: marks the enclosing control_record
+ * as awake and wakes the task stored in it. The timer is never re-armed.
+ */
+static enum hrtimer_restart hr_sleep_callback(struct hrtimer *timer)
+{
+	control_record *rec = container_of(timer, control_record, hr_timer);
+
+	rec->awake = 1;
+	wake_up_process(rec->task);
+	return HRTIMER_NORESTART;
+}
+
+/**
+ * hr_sleep - a high-resolution sleep service for fine-grained timeouts
+ * @nanoseconds:   the requested sleep period in nanoseconds
+ *
+ * Returns:
+ * 0 when the sleep request successfully terminated
+ * -EINVAL if a sleep period < 0 is requested
+ * -EINTR if a signal interrupted the calling thread
+ */
+SYSCALL_DEFINE1(hr_sleep, long, nanoseconds)
+{
+	DECLARE_WAIT_QUEUE_HEAD(the_queue); /* private queue, never woken directly */
+	control_record control;
+
+	if (nanoseconds < 0)
+		return -EINVAL;
+
+	if (nanoseconds == 0)
+		return 0;
+
+	/* Keep the record in this frame instead of aliasing current->stack. */
+	hrtimer_init_on_stack(&control.hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	control.hr_timer.function = hr_sleep_callback;
+	control.task = current;
+	control.pid = current->pid;
+	control.awake = 0;
+	hrtimer_start(&control.hr_timer, ktime_set(0, nanoseconds), HRTIMER_MODE_REL);
+	wait_event_interruptible(the_queue, control.awake == 1);
+	hrtimer_cancel(&control.hr_timer);
+	destroy_hrtimer_on_stack(&control.hr_timer);
+
+	/* awake still clear means a signal, not the timer, ended the wait. */
+	return control.awake ? 0 : -EINTR;
+}
+
+#endif
+
 /*
  * Functions related to boot-time initialization:
  */
-- 
2.25.1