Control: retitle -1 ltrace: [p]{read,write}[v]() handling, common *64() 
functions in modern glibc, [fl]seek[o]()/ftell[o](); requisite [u]llong; 20x 
speed optimisation in default config

Okay, that was better, but still too slow to debug tail without annoying me.

~20x faster wall-time in default config  (baseline 8.3s ),
 ~7x                  with -s9999999     (         17.4s),
 ~2x                  with -{A,s}9999999 (         39.6s).

The speed-up is probably larger for sane use cases, but this program consists
exclusively of read(64k) and writev(1024 iovs, each 7 bytes).

Please consider this, it gets rid of ltrace's legendary slowness for me.

Best,
наб
From: =?utf-8?b?0L3QsNCx?= <nabijaczlew...@nabijaczleweli.xyz>
Date: Wed, 26 Jul 2023 19:12:16 +0200
Subject: Read larger-than-word buffers with process_vm_readv. Pre-buffer
 arrays of primitives

The first optimisation is obvious: at one point, without it,
ltrace does ptrace(PTRACE_PEEKDATA) /79959 consecutive times/
(thus reading 640kB; imagine how much slower it'd be on
 i686 with double the syscall count and slower syscalls).

The second is necessary because arrays are reified
by reifying each element separately.
Thus, a 64kB string will cause ltrace to PTRACE_PEEKDATA 65,536 times.

Instead, cooperatively notify the reification module before formatting
an array, and read (up to a megabyte) into a static buffer.
This means that the no-flag and -s9999999999 runs complete without ever
PTRACE_PEEKDATAing a single character.

The zero() lens would (needlessly) check more than the required
mapping size, which is limited by -A and -s: restrict it to never
try more than the maximum of those + 1, which allows us to speculatively
pre-map the entire prospective NUL-terminated string too.

Evaluation:
The program consists almost exclusively of pread64(64kB)
of seven-byte-lines and writev(1024)s (where each iov is one line)
of the whole read buffer. The input file is 6.6M, of seq 1000000.

$ time out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m0.030s
user    0m0.017s
sys     0m0.013s

165kB of output:
$ time ltrace -o /dev/null -F /etc/ltrace.conf  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m8.508s
user    0m2.081s
sys     0m6.389s
$ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m0.464s
user    0m0.173s
sys     0m0.282s

7.7MB of output:
$ time ltrace -o /dev/null -F /etc/ltrace.conf -s9999999999  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m17.820s
user    0m5.384s
sys     0m12.395s
$ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf -s9999999999  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m2.678s
user    0m2.377s
sys     0m0.265s

24M of output (especially torturous for the reasons mentioned above):
$ time ltrace -o /dev/null -F /etc/ltrace.conf -{A,s}9999999999  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m37.834s
user    0m12.685s
sys     0m25.027s
$ time ~/backports/ltrace/ltrace -o /dev/null -F /etc/ltrace.conf -{A,s}9999999999  out/cmd/tail -r /tmp/1000000 > /dev/null

real    0m19.512s
user    0m8.392s
sys     0m10.997s
---
 expr.c                    |  6 ++++++
 expr.h                    |  3 +++
 lens_default.c            | 10 +++++++++-
 sysdeps/linux-gnu/trace.c | 29 ++++++++++++++++++++++-------
 value.c                   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 value.h                   |  8 ++++++++
 zero.c                    |  7 +++++++
 7 files changed, 102 insertions(+), 8 deletions(-)

diff --git a/expr.c b/expr.c
index 01ce4c6..e0215d5 100644
--- a/expr.c
+++ b/expr.c
@@ -178,6 +178,12 @@ expr_is_compile_constant(struct expr_node *node)
 	return node->kind == EXPR_OP_CONST;
 }
 
+int
+expr_is_trivial(struct expr_node *node)
+{
+	return !(node->kind == EXPR_OP_CALL1 || node->kind == EXPR_OP_CALL2);
+}
+
 static int
 eval_up(struct expr_node *node, struct value *context,
 	struct value_dict *arguments, struct value *ret_value)
diff --git a/expr.h b/expr.h
index aa4b0b6..dfd6789 100644
--- a/expr.h
+++ b/expr.h
@@ -150,6 +150,9 @@ int expr_eval_word(struct expr_node *node, struct value *context,
  * things like sizeof or simple expressions might be allowed.  */
 int expr_is_compile_constant(struct expr_node *node);
 
+/* Nonzero if evaluation won't cause side effects (i.e. not EXPR_OP_CALL*). */
+int expr_is_trivial(struct expr_node *node);
+
 /* Returns a pre-computed expression "self".  */
 struct expr_node *expr_self(void);
 
diff --git a/lens_default.c b/lens_default.c
index 71d9584..767465f 100644
--- a/lens_default.c
+++ b/lens_default.c
@@ -342,14 +342,22 @@ format_array(FILE *stream, struct value *value, struct value_dict *arguments,
 	     struct expr_node *length, size_t maxlen, int before,
 	     const char *open, const char *close, const char *delim)
 {
-	/* We need "long" to be long enough to cover the whole address
+	/* We need "long long" to be long enough to cover the whole address
 	 * space.  */
 	(void)sizeof(char[1 - 2*(sizeof(long long) < sizeof(void *))]);
+
+	int preloaded __attribute__((__cleanup__(value_preload_for_array_flush))) = -1;
+	if(!expr_is_trivial(length))  // zero()
+		preloaded = value_preload_for_array(value, maxlen + 1);
+
 	long long l;
 	if (expr_eval_word(length, value, arguments, &l) < 0)
 		return -1;
 	size_t len = (size_t)l;
 
+	if(preloaded == -1)
+		preloaded = value_preload_for_array(value, len > maxlen ? maxlen : len);
+
 	int written = 0;
 	if (acc_fprintf(&written, stream, "%s", open) < 0)
 		return -1;
diff --git a/sysdeps/linux-gnu/trace.c b/sysdeps/linux-gnu/trace.c
index e13b761..8fb0fcd 100644
--- a/sysdeps/linux-gnu/trace.c
+++ b/sysdeps/linux-gnu/trace.c
@@ -21,6 +21,7 @@
  * 02110-1301 USA
  */
 
+#define _GNU_SOURCE
 #include "config.h"
 
 #include <asm/unistd.h>
@@ -31,6 +32,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/uio.h>
 #include <unistd.h>
 
 #ifdef HAVE_LIBSELINUX
@@ -1183,8 +1185,21 @@ umovebytes(Process *proc, void *addr, void *laddr, size_t len) {
 	int started = 0;
 	size_t offset = 0, bytes_read = 0;
 
+
+	if(len > sizeof(long)) {
+		struct iovec local = {.iov_base = laddr, .iov_len = len};
+		struct iovec remote = {.iov_base = addr, .iov_len = len};
+		ssize_t rd = process_vm_readv(proc->pid, &local, 1, &remote, 1, 0);
+		if(rd != -1) {
+			started = 1;
+			offset = bytes_read = rd;
+		}
+	}
+
 	while (offset < len) {
-		a.a = ptrace(PTRACE_PEEKTEXT, proc->pid, addr + offset, 0);
+		int misalignment = (uintptr_t)(addr + offset) % sizeof(long);
+
+		a.a = ptrace(PTRACE_PEEKTEXT, proc->pid, addr + offset - misalignment, 0);
 		if (a.a == -1 && errno) {
 			if (started && errno == EIO)
 				return bytes_read;
@@ -1193,15 +1208,15 @@ umovebytes(Process *proc, void *addr, void *laddr, size_t len) {
 		}
 		started = 1;
 
-		if (len - offset >= sizeof(long)) {
-			memcpy(laddr + offset, &a.c[0], sizeof(long));
-			bytes_read += sizeof(long);
+		if (len - offset >= sizeof(long) - misalignment) {
+			memcpy(laddr + offset + misalignment, &a.c[0] + misalignment, sizeof(long) - misalignment);
+			bytes_read += sizeof(long) - misalignment;
 		}
 		else {
-			memcpy(laddr + offset, &a.c[0], len - offset);
-			bytes_read += (len - offset);
+			memcpy(laddr + offset + misalignment, &a.c[0] + misalignment, len - offset - misalignment);
+			bytes_read += (len - offset) - misalignment;
 		}
-		offset += sizeof(long);
+		offset += sizeof(long) - misalignment;
 	}
 
 	return bytes_read;
diff --git a/value.c b/value.c
index 3209b9f..1edb0e4 100644
--- a/value.c
+++ b/value.c
@@ -122,6 +122,47 @@ value_in_inferior(struct value *valp, arch_addr_t address)
 	valp->u.address = address;
 }
 
+static char preload_buf[1024 * 1024];
+static arch_addr_t preload_buf_start, preload_buf_end;
+int
+value_preload_for_array(struct value *val, size_t len)
+{
+	if (preload_buf_start || val->where != VAL_LOC_INFERIOR)
+		return 0;
+
+	struct arg_type_info *e_info = type_element(val->type, 0);
+	if (e_info == NULL)
+		return 0;
+	// We only have one preload: use it to map in arrays of primitives (strings. it's strings.)
+	switch(e_info->type) {
+		case ARGTYPE_VOID:
+		case ARGTYPE_ARRAY:
+		case ARGTYPE_STRUCT:
+			return 0;
+		default:
+			break;
+	}
+
+	size_t el_sz = type_sizeof(val->inferior, e_info);
+	if (el_sz == (size_t)-1)
+		return 0;
+
+	if (__builtin_mul_overflow(len, el_sz, &len) || len > sizeof(preload_buf))
+		len = sizeof(preload_buf);
+	size_t rd = umovebytes(val->inferior, val->u.inf_address, preload_buf, len);
+	if(rd == (size_t)-1)
+		return 0;
+	preload_buf_start = val->u.inf_address;
+	preload_buf_end = val->u.inf_address + rd;
+	return 1;
+}
+void
+value_preload_for_array_flush(int *preloaded)
+{
+	if (*preloaded && *preloaded != -1)
+		preload_buf_start = preload_buf_end = 0;
+}
+
 int
 value_reify(struct value *val, struct value_dict *arguments)
 {
@@ -145,12 +186,18 @@ value_reify(struct value *val, struct value_dict *arguments)
 		nloc = VAL_LOC_COPY;
 	}
 
+	if (val->u.inf_address >= preload_buf_start && val->u.inf_address + size <= preload_buf_end) {
+		memcpy(data, preload_buf + (val->u.inf_address - preload_buf_start), size);
+		goto ok;
+	}
+
 	if (umovebytes(val->inferior, val->u.inf_address, data, size) < size) {
 		if (nloc == VAL_LOC_COPY)
 			free(data);
 		return -1;
 	}
 
+ok:
 	val->where = nloc;
 	if (nloc == VAL_LOC_COPY)
 		val->u.address = data;
diff --git a/value.h b/value.h
index a527667..e6edabd 100644
--- a/value.h
+++ b/value.h
@@ -113,6 +113,14 @@ int value_init_element(struct value *ret_val, struct value *valp, size_t element
  * RET_VAL.  Returns 0 on success, or negative value on failure.  */
 int value_init_deref(struct value *ret_val, struct value *valp);
 
+/* Pre-read the contents of an array of max length len from the tracee.
+ * Optional optimisation. */
+int value_preload_for_array(struct value *val, size_t len);
+
+/* Must point to return value from value_preload_for_array() or -1.
+ * Suitable for use as __attribute__((__cleanup__(value_preload_for_array_flush))). */
+void value_preload_for_array_flush(int *preloaded);
+
 /* If value is in inferior, copy it over to ltrace.  Return 0 for
  * success or negative value for failure.  */
 int value_reify(struct value *val, struct value_dict *arguments);
diff --git a/zero.c b/zero.c
index ddb5e0d..a249086 100644
--- a/zero.c
+++ b/zero.c
@@ -32,6 +32,13 @@ zero_callback_max(struct value *ret_value, struct value *lhs,
 		  struct value_dict *arguments,
 		  size_t max, void *data)
 {
+	static size_t maxlen;
+	if(!maxlen) {
+		maxlen = options.strlen > options.arraylen ? options.strlen : options.arraylen;
+		if(maxlen != (size_t)-1)
+			++maxlen;
+	}
+	max = max < maxlen ? max : maxlen;
 	size_t i;
 	for (i = 0; i < max; ++i) {
 		struct value element;

Attachment: signature.asc
Description: PGP signature

Reply via email to