Control: retitle -1 ltrace: [p]{read,write}[v]() handling, common *64() functions in modern glibc, [fl]seek[o]()/ftell[o](); requisite [u]llong; 20x speed optimisation in default config
Okay, that was better, but still too slow to debug tail without annoying me. This patch is ~20x faster in wall-time in the default config (baseline 8.3s), ~7x with -s9999999 (17.4s), ~2x with -{A,s}9999999 (39.6s). Probably more for sane use cases, but this program consists exclusively of read(64k) and writev(1024 iovs, each 7 bytes). Please consider this; it gets rid of ltrace's legendary slowness for me. Best, наб
From: =?utf-8?b?0L3QsNCx?= <nabijaczlew...@nabijaczleweli.xyz> Date: Wed, 26 Jul 2023 19:12:16 +0200 Subject: Read larger-than-word buffers with process_vm_readv. Pre-buffer arrays of primitives The first optimisation is obvious: at one point, without it, ltrace does ptrace(PTRACE_PEEKDATA) /79959 consecutive times/ (thus reading 640kB; imagine how much slower it'd be on i686 with double the syscall count and slower syscalls). The second is necessary because arrays are reified by reifying each element separately. Thus, a 64kB string will cause ltrace to PTRACE_PEEKDATA 65,536 times. Instead, cooperatively notify the reification module before formatting an array, and read (up to a megabyte) into a static buffer. This means that the no-flag and -s9999999999 runs complete without ever PTRACE_PEEKDATAing a single character. The zero() lens would (needlessly) check more than the required mapping size, which is limited by -A and -s: restrict it to never try more than the maximum of those + 1, which allows us to speculatively pre-map the entire prospective NUL-terminated string too. Evaluation: The program consists almost exclusively of pread64(64kB) of seven-byte lines and writev(1024)s (where each iov is one line) of the whole read buffer. The input file is 6.6M, of seq 1000000.
$ time out/cmd/tail -r /tmp/1000000 > /dev/null real 0m0.030s user 0m0.017s sys 0m0.013s 165kB of output: $ time ltrace -o /dev/null -F /etc/ltrace.conf out/cmd/tail -r /tmp/1000000 > /dev/null real 0m8.508s user 0m2.081s sys 0m6.389s $ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf out/cmd/tail -r /tmp/1000000 > /dev/null real 0m0.464s user 0m0.173s sys 0m0.282s 7.7MB of output: $ time ltrace -o /dev/null -F /etc/ltrace.conf -s9999999999 out/cmd/tail -r /tmp/1000000 > /dev/null real 0m17.820s user 0m5.384s sys 0m12.395s $ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf -s9999999999 out/cmd/tail -r /tmp/1000000 > /dev/null real 0m2.678s user 0m2.377s sys 0m0.265s 24M of output (especially torturous for the reasons mentioned above): $ time ltrace -o /dev/null -F /etc/ltrace.conf -{A,s}9999999999 out/cmd/tail -r /tmp/1000000 > /dev/null real 0m37.834s user 0m12.685s sys 0m25.027s $ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf -{A,s}9999999999 out/cmd/tail -r /tmp/1000000 > /dev/null real 0m19.512s user 0m8.392s sys 0m10.997s --- expr.c | 6 ++++++ expr.h | 3 +++ lens_default.c | 10 +++++++++- sysdeps/linux-gnu/trace.c | 29 ++++++++++++++++++++++------- value.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ value.h | 8 ++++++++ zero.c | 7 +++++++ 7 files changed, 102 insertions(+), 8 deletions(-) diff --git a/expr.c b/expr.c index 01ce4c6..e0215d5 100644 --- a/expr.c +++ b/expr.c @@ -178,6 +178,12 @@ expr_is_compile_constant(struct expr_node *node) return node->kind == EXPR_OP_CONST; } +int +expr_is_trivial(struct expr_node *node) +{ + return !(node->kind == EXPR_OP_CALL1 || node->kind == EXPR_OP_CALL2); +} + static int eval_up(struct expr_node *node, struct value *context, struct value_dict *arguments, struct value *ret_value) diff --git a/expr.h b/expr.h index aa4b0b6..dfd6789 100644 --- a/expr.h +++ b/expr.h @@ -150,6 +150,9 @@ int expr_eval_word(struct expr_node *node, struct value *context, * things like 
sizeof or simple expressions might be allowed. */ int expr_is_compile_constant(struct expr_node *node); +/* Nonzero if evalutaion won't cause side effects (i.e. not EXPR_OP_CALL*). */ +int expr_is_trivial(struct expr_node *node); + /* Returns a pre-computed expression "self". */ struct expr_node *expr_self(void); diff --git a/lens_default.c b/lens_default.c index 71d9584..767465f 100644 --- a/lens_default.c +++ b/lens_default.c @@ -342,14 +342,22 @@ format_array(FILE *stream, struct value *value, struct value_dict *arguments, struct expr_node *length, size_t maxlen, int before, const char *open, const char *close, const char *delim) { - /* We need "long" to be long enough to cover the whole address + /* We need "long long" to be long enough to cover the whole address * space. */ (void)sizeof(char[1 - 2*(sizeof(long long) < sizeof(void *))]); + + int preloaded __attribute__((__cleanup__(value_preload_for_array_flush))) = -1; + if(!expr_is_trivial(length)) // zero() + preloaded = value_preload_for_array(value, maxlen + 1); + long long l; if (expr_eval_word(length, value, arguments, &l) < 0) return -1; size_t len = (size_t)l; + if(preloaded == -1) + preloaded = value_preload_for_array(value, len > maxlen ? 
maxlen : len); + int written = 0; if (acc_fprintf(&written, stream, "%s", open) < 0) return -1; diff --git a/sysdeps/linux-gnu/trace.c b/sysdeps/linux-gnu/trace.c index e13b761..8fb0fcd 100644 --- a/sysdeps/linux-gnu/trace.c +++ b/sysdeps/linux-gnu/trace.c @@ -21,6 +21,7 @@ * 02110-1301 USA */ +#define _GNU_SOURCE #include "config.h" #include <asm/unistd.h> @@ -31,6 +32,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/uio.h> #include <unistd.h> #ifdef HAVE_LIBSELINUX @@ -1183,8 +1185,21 @@ umovebytes(Process *proc, void *addr, void *laddr, size_t len) { int started = 0; size_t offset = 0, bytes_read = 0; + + if(len > sizeof(long)) { + struct iovec local = {.iov_base = laddr, .iov_len = len}; + struct iovec remote = {.iov_base = addr, .iov_len = len}; + ssize_t rd = process_vm_readv(proc->pid, &local, 1, &remote, 1, 0); + if(rd != -1) { + started = 1; + offset = bytes_read = rd; + } + } + while (offset < len) { - a.a = ptrace(PTRACE_PEEKTEXT, proc->pid, addr + offset, 0); + int misalignment = (uintptr_t)(addr + offset) % sizeof(long); + + a.a = ptrace(PTRACE_PEEKTEXT, proc->pid, addr + offset - misalignment, 0); if (a.a == -1 && errno) { if (started && errno == EIO) return bytes_read; @@ -1193,15 +1208,15 @@ umovebytes(Process *proc, void *addr, void *laddr, size_t len) { } started = 1; - if (len - offset >= sizeof(long)) { - memcpy(laddr + offset, &a.c[0], sizeof(long)); - bytes_read += sizeof(long); + if (len - offset >= sizeof(long) - misalignment) { + memcpy(laddr + offset + misalignment, &a.c[0] + misalignment, sizeof(long) - misalignment); + bytes_read += sizeof(long) - misalignment; } else { - memcpy(laddr + offset, &a.c[0], len - offset); - bytes_read += (len - offset); + memcpy(laddr + offset + misalignment, &a.c[0] + misalignment, len - offset - misalignment); + bytes_read += (len - offset) - misalignment; } - offset += sizeof(long); + offset += sizeof(long) - misalignment; } return bytes_read; diff --git a/value.c b/value.c 
index 3209b9f..1edb0e4 100644 --- a/value.c +++ b/value.c @@ -122,6 +122,47 @@ value_in_inferior(struct value *valp, arch_addr_t address) valp->u.address = address; } +static char preload_buf[1024 * 1024]; +static arch_addr_t preload_buf_start, preload_buf_end; +int +value_preload_for_array(struct value *val, size_t len) +{ + if (preload_buf_start || val->where != VAL_LOC_INFERIOR) + return 0; + + struct arg_type_info *e_info = type_element(val->type, 0); + if (e_info == NULL) + return 0; + // We only have one preload: use it to map in arrays of primitives (strings. it's strings.) + switch(e_info->type) { + case ARGTYPE_VOID: + case ARGTYPE_ARRAY: + case ARGTYPE_STRUCT: + return 0; + default: + break; + } + + size_t el_sz = type_sizeof(val->inferior, e_info); + if (el_sz == (size_t)-1) + return 0; + + if (__builtin_mul_overflow(len, el_sz, &len) || len > sizeof(preload_buf)) + len = sizeof(preload_buf); + size_t rd = umovebytes(val->inferior, val->u.inf_address, preload_buf, len); + if(rd == (size_t)-1) + return 0; + preload_buf_start = val->u.inf_address; + preload_buf_end = val->u.inf_address + rd; + return 1; +} +void +value_preload_for_array_flush(int *preloaded) +{ + if (*preloaded && *preloaded != -1) + preload_buf_start = preload_buf_end = 0; +} + int value_reify(struct value *val, struct value_dict *arguments) { @@ -145,12 +186,18 @@ value_reify(struct value *val, struct value_dict *arguments) nloc = VAL_LOC_COPY; } + if (val->u.inf_address >= preload_buf_start && val->u.inf_address + size <= preload_buf_end) { + memcpy(data, preload_buf + (val->u.inf_address - preload_buf_start), size); + goto ok; + } + if (umovebytes(val->inferior, val->u.inf_address, data, size) < size) { if (nloc == VAL_LOC_COPY) free(data); return -1; } +ok: val->where = nloc; if (nloc == VAL_LOC_COPY) val->u.address = data; diff --git a/value.h b/value.h index a527667..e6edabd 100644 --- a/value.h +++ b/value.h @@ -113,6 +113,14 @@ int value_init_element(struct value *ret_val, struct 
value *valp, size_t element * RET_VAL. Returns 0 on success, or negative value on failure. */ int value_init_deref(struct value *ret_val, struct value *valp); +/* Pre-read the contents of an array of max length len from the tracee. + * Optional optimisation. */ +int value_preload_for_array(struct value *val, size_t len); + +/* Must point to return value from value_preload_for_array() or -1. + * Suitable for use as __attribute__((__cleanup__(value_preload_for_array_flush))). */ +void value_preload_for_array_flush(int *preloaded); + /* If value is in inferior, copy it over to ltrace. Return 0 for * success or negative value for failure. */ int value_reify(struct value *val, struct value_dict *arguments); diff --git a/zero.c b/zero.c index ddb5e0d..a249086 100644 --- a/zero.c +++ b/zero.c @@ -32,6 +32,13 @@ zero_callback_max(struct value *ret_value, struct value *lhs, struct value_dict *arguments, size_t max, void *data) { + static size_t maxlen; + if(!maxlen) { + maxlen = options.strlen > options.arraylen ? options.strlen : options.arraylen; + if(maxlen != (size_t)-1) + ++maxlen; + } + max = max < maxlen ? max : maxlen; size_t i; for (i = 0; i < max; ++i) { struct value element;
signature.asc
Description: PGP signature