Currently, we use PTRACE_PEEKDATA to read things like filenames and
data passed by I/O syscalls.
PTRACE_PEEKDATA gets one word per syscall. This is VERY expensive.
For example, in order to print fstat syscall, we need to perform this:

write(3, "fstat64(1, ", 11)             = 11
ptrace(PTRACE_SYSCALL, 3983, 0x1, SIG_0) = 0
--- {si_signo=SIGCHLD, si_code=CLD_TRAPPED, si_pid=3983, si_status=SIGTRAP, 
si_utime=0, si_stime=0} (Child exited) ---
wait4(-1, [{WIFSTOPPED(s) && WSTOPSIG(s) == 133}], __WALL, NULL) = 3983
ptrace(PTRACE_GETREGS, 3983, 0, 0x80885a0) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ecc, [0xa]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ed8, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1edc, [0x2190]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee0, [0x1]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ee8, [0x5]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1eec, [0x8800]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef0, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef4, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1ef8, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1efc, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f00, [0x400]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f04, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f08, [0]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f0c, [0x4f192228]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f10, [0x18f0e6ba]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f14, [0x4f194c1f]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f18, [0x1b365bd4]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f1c, [0x4f192153]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f20, [0x246674e9]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f24, [0x3]) = 0
ptrace(PTRACE_PEEKDATA, 3983, 0xbf8b1f28, [0]) = 0
write(3, "{st_mode=S_IFCHR|0620, st_rdev=m"..., 58) = 58

Twenty four trips into kernel to fetch one struct stat!

Kernel just got a new syscall, process_vm_readv(), which can be used to
copy data out of process' address space. Looks like it's ideal
for this goal.

This is a patch which uses process_vm_readv() in umoven() and umovestr()
functions if possible, with fallback to old method if process_vm_readv()
returns ENOSYS.

There is a slight change in API: since I read data in blocks of 256 bytes,
now umovestr() may overwrite data in the buffer past terminating NUL.
One caller used to depend on guarantee that this never happens, but
I just committed a small patch which reworks that place to remove
this assumption.

I don't have a kernel where process_vm_readv() syscall actually works,
so the patch definitely needs to wait until it can be tested for real.
So far I only tested that on ENOSYS it indeed falls back to old code
and everything continues to work :)

What do you guys think about it?

-- 
vda


diff -d -urpN strace.5/linux/i386/syscallent.h strace.6/linux/i386/syscallent.h
--- strace.5/linux/i386/syscallent.h    2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/i386/syscallent.h    2012-01-20 12:01:46.824097896 +0100
@@ -377,8 +377,8 @@
        { 1,    TD,     sys_syncfs,             "syncfs"        }, /* 344 */
        { 4,    TN,     sys_sendmmsg,           "sendmmsg"      }, /* 345 */
        { 2,    TD,     sys_setns,              "setns"         }, /* 346 */
-       { 5,    0,      printargs,              "SYS_347"       }, /* 347 */
-       { 5,    0,      printargs,              "SYS_348"       }, /* 348 */
+       { 6,    0,      printargs,              "process_vm_readv"      }, /* 
347 */
+       { 6,    0,      printargs,              "process_vm_writev"     }, /* 
348 */
        { 5,    0,      printargs,              "SYS_349"       }, /* 349 */
        { 5,    0,      printargs,              "SYS_350"       }, /* 350 */
        { 5,    0,      printargs,              "SYS_351"       }, /* 351 */
diff -d -urpN strace.5/linux/powerpc/syscallent.h 
strace.6/linux/powerpc/syscallent.h
--- strace.5/linux/powerpc/syscallent.h 2012-01-10 16:48:13.000000000 +0100
+++ strace.6/linux/powerpc/syscallent.h 2012-01-20 12:01:46.824097896 +0100
@@ -379,8 +379,8 @@
        { 1,    TD,     sys_syncfs,             "syncfs"                }, /* 
348 */
        { 4,    TN,     sys_sendmmsg,           "sendmmsg"              }, /* 
349 */
        { 2,    TD,     sys_setns,              "setns"                 }, /* 
350 */
-       { 5,    0,      printargs,              "SYS_351"               }, /* 
351 */
-       { 5,    0,      printargs,              "SYS_352"               }, /* 
352 */
+       { 6,    0,      printargs,              "process_vm_readv"      }, /* 
351 */
+       { 6,    0,      printargs,              "process_vm_writev"     }, /* 
352 */
        { 5,    0,      printargs,              "SYS_353"               }, /* 
353 */
        { 5,    0,      printargs,              "SYS_354"               }, /* 
354 */
        { 5,    0,      printargs,              "SYS_355"               }, /* 
355 */
diff -d -urpN strace.5/linux/x86_64/syscallent.h 
strace.6/linux/x86_64/syscallent.h
--- strace.5/linux/x86_64/syscallent.h  2012-01-04 15:09:05.000000000 +0100
+++ strace.6/linux/x86_64/syscallent.h  2012-01-20 12:01:46.824097896 +0100
@@ -308,3 +308,5 @@
        { 4,    TN,     sys_sendmmsg,           "sendmmsg"      }, /* 307 */
        { 2,    TD,     sys_setns,              "setns"         }, /* 308 */
        { 3,    0,      sys_getcpu,             "getcpu"        }, /* 309 */
+       { 6,    0,      printargs,              "process_vm_readv"      }, /* 
310 */
+       { 6,    0,      printargs,              "process_vm_writev"     }, /* 
311 */
diff -d -urpN strace.5/util.c strace.6/util.c
--- strace.5/util.c     2012-01-20 11:55:31.000000000 +0100
+++ strace.6/util.c     2012-01-20 12:08:23.222153067 +0100
@@ -769,6 +769,35 @@ dumpstr(struct tcb *tcp, long addr, int
        }
  }

+
+
+#if !defined(__NR_process_vm_readv)
+# if defined(I386)
+#  define __NR_process_vm_readv  347
+#  define __NR_process_vm_writev 348
+# elif defined(X86_64)
+#  define __NR_process_vm_readv  310
+#  define __NR_process_vm_writev 311
+# endif
+#endif
+
+#if defined(__NR_process_vm_readv)
+static bool process_vm_readv_not_supported = 0;
+static ssize_t process_vm_readv(pid_t pid,
+                const struct iovec *lvec,
+                unsigned long liovcnt,
+                const struct iovec *rvec,
+                unsigned long riovcnt,
+                unsigned long flags)
+{
+       return syscall(__NR_process_vm_readv, (long)pid, lvec, liovcnt, rvec, 
riovcnt, flags);
+}
+#else
+static bool process_vm_readv_not_supported = 1;
+# define process_vm_readv(...) (errno = ENOSYS, -1)
+#endif
+
+
  #define PAGMASK       (~(PAGSIZ - 1))
  /*
   * move `len' bytes of data from process `pid'
@@ -786,6 +815,29 @@ umoven(struct tcb *tcp, long addr, int l
                char x[sizeof(long)];
        } u;

+       if (!process_vm_readv_not_supported) {
+               struct iovec local[1], remote[1];
+               int r;
+
+               local[0].iov_base = laddr;
+               remote[0].iov_base = (void*)addr;
+               local[0].iov_len = remote[0].iov_len = len;
+               r = process_vm_readv(pid,
+                               local, 1,
+                               remote, 1,
+                               /*flags:*/ 0
+               );
+               if (r < 0) {
+                       if (errno == ENOSYS) {
+                               process_vm_readv_not_supported = 1;
+                               goto vm_readv_not_supported;
+                       }
+                       perror("process_vm_readv");
+               }
+               return r;
+       }
+ vm_readv_not_supported:
+
  #if SUPPORTED_PERSONALITIES > 1
        if (personality_wordsize[current_personality] < sizeof(addr))
                addr &= (1ul << 8 * personality_wordsize[current_personality]) 
- 1;
@@ -921,6 +973,55 @@ umovestr(struct tcb *tcp, long addr, int
                addr &= (1ul << 8 * personality_wordsize[current_personality]) 
- 1;
  #endif

+       if (!process_vm_readv_not_supported) {
+               struct iovec local[1], remote[1];
+
+               local[0].iov_base = laddr;
+               remote[0].iov_base = (void*)addr;
+
+               while (len > 0) {
+                       int end_in_page;
+                       int r;
+                       int chunk_len;
+
+                       /* Don't read kilobytes: most strings are short */
+                       chunk_len = len;
+                       if (chunk_len > 256)
+                               chunk_len = 256;
+                       /* Don't cross pages. I guess otherwise we can get 
EFAULT
+                        * and fail to notice that terminating NUL lies
+                        * in the existing (first) page.
+                        * (I hope there aren't arches with pages < 4K)
+                        */
+                       end_in_page = ((addr + chunk_len) & 4095);
+                       r = chunk_len - end_in_page;
+                       if (r > 0) /* if chunk_len > end_in_page */
+                               chunk_len = r; /* chunk_len -= end_in_page */
+
+                       local[0].iov_len = remote[0].iov_len = chunk_len;
+                       r = process_vm_readv(pid,
+                                       local, 1,
+                                       remote, 1,
+                                       /*flags:*/ 0
+                       );
+                       if (r < 0) {
+                               if (errno == ENOSYS) {
+                                       process_vm_readv_not_supported = 1;
+                                       goto vm_readv_not_supported;
+                               }
+                               perror("process_vm_readv");
+                               return -1;
+                       }
+                       if (memchr(local[0].iov_base, 0, r))
+                               return 1;
+                       local[0].iov_base += r;
+                       remote[0].iov_base += r;
+                       len -= r;
+               }
+               return 0;
+       }
+ vm_readv_not_supported:
+
        if (addr & (sizeof(long) - 1)) {
                /* addr not a multiple of sizeof(long) */
                n = addr - (addr & -sizeof(long)); /* residue */

------------------------------------------------------------------------------
Keep Your Developer Skills Current with LearnDevNow!
The most comprehensive online learning library for Microsoft developers
is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
Metro Style Apps, more. Free future releases when you subscribe now!
http://p.sf.net/sfu/learndevnow-d2d
_______________________________________________
Strace-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/strace-devel

Reply via email to