Instead of using get_user_pages_fast() and kmap_atomic() when writing
to the trace_marker file, just allocate enough space on the ring buffer
directly, and write into it via copy_from_user().

Writing into the trace_marker file used to allocate a temporary buffer
to perform the copy_from_user(), as we didn't want to write into the
ring buffer if the copy failed. But as a trace_marker write is supposed
to be extremely fast, and allocating memory causes other tracepoints to
trigger, Peter Zijlstra suggested using get_user_pages_fast() and
kmap_atomic() to keep the user space pages in memory and reading it
directly.

Instead, just allocate the space in the ring buffer and use
copy_from_user() directly. If it faults, return -EFAULT and write
"<faulted>" into the ring buffer.

On architectures without an arch-specific get_user_pages_fast(), this
will end up in the generic get_user_pages_fast() and this grabs
mm->mmap_sem. Once you do this, then suddenly writing to the
trace_marker can cause priority-inversions.

This is a backport of Steven Rostedt's patch [1], applied to 3.10.x, so the
Signed-off-by chain is somewhat uncertain at this stage.

The patch compiles, boots and does not immediately explode on impact. By
definition [2] it must therefore be perfect.

1) https://www.spinics.net/lists/kernel/msg2400769.html
2) http://lkml.iu.edu/hypermail/linux/kernel/9804.1/0149.html

Cc: Ingo Molnar <mi...@kernel.org>
Cc: Henrik Austad <hen...@austad.us>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: sta...@vger.kernel.org

Suggested-by: Thomas Gleixner <t...@linutronix.de>
Used-to-be-signed-off-by: Steven Rostedt <rost...@goodmis.org>
Backported-by: Henrik Austad <haus...@cisco.com>
Tested-by: Henrik Austad <haus...@cisco.com>
Signed-off-by: Henrik Austad <haus...@cisco.com>
---
 kernel/trace/trace.c | 78 +++++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 56 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 18cdf91..94eb1ee 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4501,15 +4501,13 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
        struct ring_buffer *buffer;
        struct print_entry *entry;
        unsigned long irq_flags;
-       struct page *pages[2];
-       void *map_page[2];
-       int nr_pages = 1;
+       const char faulted[] = "<faulted>";
        ssize_t written;
-       int offset;
        int size;
        int len;
-       int ret;
-       int i;
+
+/* Used in tracing_mark_raw_write() as well */
+#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
 
        if (tracing_disabled)
                return -EINVAL;
@@ -4520,60 +4518,34 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
        if (cnt > TRACE_BUF_SIZE)
                cnt = TRACE_BUF_SIZE;
 
-       /*
-        * Userspace is injecting traces into the kernel trace buffer.
-        * We want to be as non intrusive as possible.
-        * To do so, we do not want to allocate any special buffers
-        * or take any locks, but instead write the userspace data
-        * straight into the ring buffer.
-        *
-        * First we need to pin the userspace buffer into memory,
-        * which, most likely it is, because it just referenced it.
-        * But there's no guarantee that it is. By using get_user_pages_fast()
-        * and kmap_atomic/kunmap_atomic() we can get access to the
-        * pages directly. We then write the data directly into the
-        * ring buffer.
-        */
        BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
 
-       /* check if we cross pages */
-       if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
-               nr_pages = 2;
-
-       offset = addr & (PAGE_SIZE - 1);
-       addr &= PAGE_MASK;
-
-       ret = get_user_pages_fast(addr, nr_pages, 0, pages);
-       if (ret < nr_pages) {
-               while (--ret >= 0)
-                       put_page(pages[ret]);
-               written = -EFAULT;
-               goto out;
-       }
+       local_save_flags(irq_flags);
+       size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
 
-       for (i = 0; i < nr_pages; i++)
-               map_page[i] = kmap_atomic(pages[i]);
+       /* If less than "<faulted>", then make sure we can still add that */
+       if (cnt < FAULTED_SIZE)
+               size += FAULTED_SIZE - cnt;
 
-       local_save_flags(irq_flags);
-       size = sizeof(*entry) + cnt + 2; /* possible \n added */
        buffer = tr->trace_buffer.buffer;
        event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
                                          irq_flags, preempt_count());
-       if (!event) {
-               /* Ring buffer disabled, return as if not open for write */
-               written = -EBADF;
-               goto out_unlock;
-       }
+
+       if (unlikely(!event))
+               /* Ring buffer disabled, return as if not open for write */
+               return -EBADF;
 
        entry = ring_buffer_event_data(event);
        entry->ip = _THIS_IP_;
 
-       if (nr_pages == 2) {
-               len = PAGE_SIZE - offset;
-               memcpy(&entry->buf, map_page[0] + offset, len);
-               memcpy(&entry->buf[len], map_page[1], cnt - len);
+       len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
+       if (len) {
+               memcpy(&entry->buf, faulted, FAULTED_SIZE);
+               cnt = FAULTED_SIZE;
+               written = -EFAULT;
        } else
-               memcpy(&entry->buf, map_page[0] + offset, cnt);
+               written = cnt;
+       len = cnt;
 
        if (entry->buf[cnt - 1] != '\n') {
                entry->buf[cnt] = '\n';
@@ -4583,16 +4555,10 @@ tracing_mark_write(struct file *filp, const char __user 
*ubuf,
 
        __buffer_unlock_commit(buffer, event);
 
-       written = cnt;
 
-       *fpos += written;
+       if (written > 0)
+               *fpos += written;
 
- out_unlock:
-       for (i = 0; i < nr_pages; i++){
-               kunmap_atomic(map_page[i]);
-               put_page(pages[i]);
-       }
- out:
        return written;
 }
 
-- 
2.7.4

Reply via email to