from:"Ilya Leoshkevich"

[PATCH 3/4] linux-user/sparc: Handle "ta 5"

2023-01-31 Thread Ilya Leoshkevich

GCC lowers __builtin_trap() to "ta 5", which in turn generates trap
0x105. Follow what kernel's bad_trap() is doing there.

Signed-off-by: Ilya Leoshkevich 
---
 linux-user/sparc/cpu_loop.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-user/sparc/cpu_loop.c b/linux-user/sparc/cpu_loop.c
index 434c90a55f8..fa36d452a51 100644
--- a/linux-user/sparc/cpu_loop.c
+++ b/linux-user/sparc/cpu_loop.c
@@ -225,6 +225,9 @@ void cpu_loop (CPUSPARCState *env)
 restore_window(env);
 break;
 #ifndef TARGET_ABI32
+case 0x105:
+force_sig_fault(TARGET_SIGILL, ILL_ILLTRP, env->pc);
+break;
 case 0x16e:
 flush_windows(env);
 sparc64_get_context(env);
-- 
2.39.1

[PATCH 2/4] linux-user/microblaze: Handle privileged exception

2023-01-31 Thread Ilya Leoshkevich

Follow what kernel's full_exception() is doing.

Signed-off-by: Ilya Leoshkevich 
---
 linux-user/microblaze/cpu_loop.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/linux-user/microblaze/cpu_loop.c b/linux-user/microblaze/cpu_loop.c
index 5ccf9e942ea..212e62d0a62 100644
--- a/linux-user/microblaze/cpu_loop.c
+++ b/linux-user/microblaze/cpu_loop.c
@@ -25,8 +25,8 @@
 
 void cpu_loop(CPUMBState *env)
 {
+int trapnr, ret, si_code, sig;
 CPUState *cs = env_cpu(env);
-int trapnr, ret, si_code;
 
 while (1) {
 cpu_exec_start(cs);
@@ -76,6 +76,7 @@ void cpu_loop(CPUMBState *env)
 env->iflags &= ~(IMM_FLAG | D_FLAG);
 switch (env->esr & 31) {
 case ESR_EC_DIVZERO:
+sig = TARGET_SIGFPE;
 si_code = TARGET_FPE_INTDIV;
 break;
 case ESR_EC_FPU:
@@ -84,6 +85,7 @@ void cpu_loop(CPUMBState *env)
  * if there's no recognized bit set.  Possibly this
  * implies that si_code is 0, but follow the structure.
  */
+sig = TARGET_SIGFPE;
 si_code = env->fsr;
 if (si_code & FSR_IO) {
 si_code = TARGET_FPE_FLTINV;
@@ -97,13 +99,17 @@ void cpu_loop(CPUMBState *env)
 si_code = TARGET_FPE_FLTRES;
 }
 break;
+case ESR_EC_PRIVINSN:
+sig = SIGILL;
+si_code = ILL_PRVOPC;
+break;
 default:
 fprintf(stderr, "Unhandled hw-exception: 0x%x\n",
 env->esr & ESR_EC_MASK);
 cpu_dump_state(cs, stderr, 0);
 exit(EXIT_FAILURE);
 }
-force_sig_fault(TARGET_SIGFPE, si_code, env->pc);
+force_sig_fault(sig, si_code, env->pc);
 break;
 
 case EXCP_DEBUG:
-- 
2.39.1

[PATCH 0/4] Fix deadlock when dying because of a signal

2023-01-31 Thread Ilya Leoshkevich

Hi,

wasmtime testsuite found a deadlock in qemu_plugin_user_exit().
I tracked it down to one of my earlier patches, which introduced
cleanup in dump_core_and_abort().

Patch 1 fixes the issue, patches 2 and 3 fix __builtin_trap()
handling in microblaze and sparc - which is needed for patch 4,
that adds a test.

Just before sending this, I noticed that a solution has already been
proposed in [1], but apparently it wasn't accepted.

Best regards,
Ilya

[1] https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg03506.html

Ilya Leoshkevich (4):
  cpus: Make {start,end}_exclusive() recursive
  linux-user/microblaze: Handle privileged exception
  linux-user/sparc: Handle "ta 5"
  tests/tcg/linux-test: Add linux-fork-trap test

 cpus-common.c   | 12 +-
 include/hw/core/cpu.h   |  4 +-
 linux-user/microblaze/cpu_loop.c| 10 -
 linux-user/sparc/cpu_loop.c |  3 ++
 tests/tcg/multiarch/linux/linux-fork-trap.c | 48 +
 5 files changed, 71 insertions(+), 6 deletions(-)
 create mode 100644 tests/tcg/multiarch/linux/linux-fork-trap.c

-- 
2.39.1

[PATCH 4/4] tests/tcg/linux-test: Add linux-fork-trap test

2023-01-31 Thread Ilya Leoshkevich

Check that dying due to a signal does not deadlock.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/multiarch/linux/linux-fork-trap.c | 48 +
 1 file changed, 48 insertions(+)
 create mode 100644 tests/tcg/multiarch/linux/linux-fork-trap.c

diff --git a/tests/tcg/multiarch/linux/linux-fork-trap.c 
b/tests/tcg/multiarch/linux/linux-fork-trap.c
new file mode 100644
index 000..a921f875380
--- /dev/null
+++ b/tests/tcg/multiarch/linux/linux-fork-trap.c
@@ -0,0 +1,48 @@
+/*
+ * Test that a fork()ed process terminates after __builtin_trap().
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+int main(void)
+{
+struct rlimit nodump;
+pid_t err, pid;
+int wstatus;
+
+pid = fork();
+assert(pid != -1);
+if (pid == 0) {
+/* We are about to crash on purpose; disable core dumps. */
+if (getrlimit(RLIMIT_CORE, )) {
+return EXIT_FAILURE;
+}
+nodump.rlim_cur = 0;
+if (setrlimit(RLIMIT_CORE, )) {
+return EXIT_FAILURE;
+}
+/*
+ * An alternative would be to dereference a NULL pointer, but that
+ * would be an UB in C.
+ */
+#if defined(__MICROBLAZE__)
+/*
+ * gcc emits "bri 0", which is an endless loop.
+ * Take glibc's ABORT_INSTRUCTION.
+ */
+asm volatile("brki r0,-1");
+#else
+__builtin_trap();
+#endif
+}
+err = waitpid(pid, , 0);
+assert(err == pid);
+assert(WIFSIGNALED(wstatus));
+
+return EXIT_SUCCESS;
+}
-- 
2.39.1

[PATCH] tests/tcg/s390x: Use -nostdlib for softmmu tests

2023-01-31 Thread Ilya Leoshkevich

The code currently uses -nostartfiles, but this does not prevent
linking with libc. On Fedora there is no cross-libc, so the linking
step fails.

Fix by using the more comprehensive -nostdlib (that's also what
probe_target_compiler() checks for as well).

Fixes: 503e549e441e ("tests/tcg/s390x: Test unaligned accesses to lowcore")
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index a34fa68473e..50c1b88065d 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -3,7 +3,7 @@ VPATH+=$(S390X_SRC)
 QEMU_OPTS=-action panic=exit-failure -kernel
 
 %: %.S
-   $(CC) -march=z13 -m64 -nostartfiles -static -Wl,-Ttext=0 \
+   $(CC) -march=z13 -m64 -nostdlib -static -Wl,-Ttext=0 \
-Wl,--build-id=none $< -o $@
 
 TESTS += unaligned-lowcore
-- 
2.39.1

[PATCH] meson: Add missing libdw knobs

2023-01-30 Thread Ilya Leoshkevich

Add the missing meson infrastructure bits for the new libdw
dependency. Model them after the existing capstone knobs.

Fixes: 7c10cb38ccb8 ("accel/tcg: Add debuginfo support")
Reported-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 meson.build   | 11 +++
 meson_options.txt |  2 ++
 scripts/meson-buildoptions.sh |  3 +++
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/meson.build b/meson.build
index 6d3b6656297..ade489a3b20 100644
--- a/meson.build
+++ b/meson.build
@@ -1649,10 +1649,13 @@ if libbpf.found() and not cc.links('''
 endif
 
 # libdw
-libdw = dependency('libdw',
-   method: 'pkg-config',
-   kwargs: static_kwargs,
-   required: false)
+libdw = not_found
+if not get_option('libdw').auto() or have_system or have_user
+libdw = dependency('libdw',
+   method: 'pkg-config',
+   kwargs: static_kwargs,
+   required: get_option('libdw'))
+endif
 
 #
 # config-host.h #
diff --git a/meson_options.txt b/meson_options.txt
index 559a571b6b6..082ce437ba5 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -129,6 +129,8 @@ option('gio', type : 'feature', value : 'auto',
description: 'use libgio for D-Bus support')
 option('glusterfs', type : 'feature', value : 'auto',
description: 'Glusterfs block device driver')
+option('libdw', type : 'feature', value : 'auto',
+   description: 'debuginfo support')
 option('libiscsi', type : 'feature', value : 'auto',
description: 'libiscsi userspace initiator')
 option('libnfs', type : 'feature', value : 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 0f71e92dcba..e5a8897b639 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -108,6 +108,7 @@ meson_options_help() {
   printf "%s\n" '  kvm KVM acceleration support'
   printf "%s\n" '  l2tpv3  l2tpv3 network backend support'
   printf "%s\n" '  libdaxctl   libdaxctl support'
+  printf "%s\n" '  libdw   debuginfo support'
   printf "%s\n" '  libiscsilibiscsi userspace initiator'
   printf "%s\n" '  libnfs  libnfs block device driver'
   printf "%s\n" '  libpmem libpmem support'
@@ -308,6 +309,8 @@ _meson_option_parse() {
 --disable-l2tpv3) printf "%s" -Dl2tpv3=disabled ;;
 --enable-libdaxctl) printf "%s" -Dlibdaxctl=enabled ;;
 --disable-libdaxctl) printf "%s" -Dlibdaxctl=disabled ;;
+--enable-libdw) printf "%s" -Dlibdw=enabled ;;
+--disable-libdw) printf "%s" -Dlibdw=disabled ;;
 --libdir=*) quote_sh "-Dlibdir=$2" ;;
 --libexecdir=*) quote_sh "-Dlibexecdir=$2" ;;
 --enable-libiscsi) printf "%s" -Dlibiscsi=enabled ;;
-- 
2.39.1

Re: [PATCH v4 2/3] accel/tcg: Add debuginfo support

2023-01-30 Thread Ilya Leoshkevich

On Mon, 2023-01-30 at 15:33 +0100, Thomas Huth wrote:
> On 12/01/2023 16.20, Ilya Leoshkevich wrote:
> > Add libdw-based functions for loading and querying debuginfo. Load
> > debuginfo from the system and the linux-user loaders.
> > 
> > This is useful for the upcoming perf support, which can then put
> > human-readable guest symbols instead of raw guest PCs into perfmap
> > and
> > jitdump files.
> > 
> > Signed-off-by: Ilya Leoshkevich 
> > ---
> ...
> > diff --git a/meson.build b/meson.build
> > index 175517eafde..cab8c67d961 100644
> > --- a/meson.build
> > +++ b/meson.build
> > @@ -1648,6 +1648,12 @@ if libbpf.found() and not cc.links('''
> >     endif
> >   endif
> >   
> > +# libdw
> > +libdw = dependency('libdw',
> > +   method: 'pkg-config',
> > +   kwargs: static_kwargs,
> > +   required: false)
> > +
> 
>   Hi,
> 
> I recently did a build with "configure --without-default-features"
> and I 
> noticed that this new libdw does not get disabled automatically
> there, which 
> looks kind of weird. Is there a reason that there is no config knob
> here to 
> disable it like we have it with all the other optional libraries that
> we 
> support?
> 
>   Thomas

Hi,

this sounds like an omission on my part - I'll have a look at this.

Best regards,
Ilya

Re: [PULL 0/5] tcg patch queue

2023-01-20 Thread Ilya Leoshkevich

On Fri, 2023-01-20 at 10:41 +0100, Thomas Huth wrote:
> On 16/01/2023 23.36, Richard Henderson wrote:
> > The following changes since commit
> > fb7e7990342e59cf67dbd895c1a1e3fb1741df7a:
> > 
> >    tests/qtest/qom-test: Do not print tested properties by default
> > (2023-01-16 15:00:57 +)
> > 
> > are available in the Git repository at:
> > 
> >    https://gitlab.com/rth7680/qemu.git tags/pull-tcg-20230116
> > 
> > for you to fetch changes up to
> > 61710a7e23a63546da0071ea32adb96476fa5d07:
> > 
> >    accel/tcg: Split out cpu_exec_{setjmp,loop} (2023-01-16 10:14:12
> > -1000)
> > 
> > 
> > - Reorg cpu_tb_exec around setjmp.
> > - Use __attribute__((target)) for buffer_is_zero.
> > - Add perfmap and jitdump for perf support.
> > 
> > 
> > Ilya Leoshkevich (3):
> >    linux-user: Clean up when exiting due to a signal
> >    accel/tcg: Add debuginfo support
> >    tcg: add perfmap and jitdump
> > 
> > Richard Henderson (2):
> >    util/bufferiszero: Use __attribute__((target)) for
> > avx2/avx512
> >    accel/tcg: Split out cpu_exec_{setjmp,loop}
> 
>   Hi Richard, hi Ilya,
> 
> with the recent QEMU master branch (commit 701ed34), I'm now seeing
> failures 
> in Travis:
> 
>   https://app.travis-ci.com/github/huth/qemu/jobs/593786529#L14411
> 
> Everything was still fine a couple of days ago (commit fb7e799):
> 
>   https://app.travis-ci.com/github/huth/qemu/builds/259755664
> 
> ... so it seems this is likely related to this pull request. Could
> you 
> please have a look?
> 
>   Thanks,
>    Thomas
> 

I would expect this to be (temporarily) fixed by [1], but we probably
don't set GITLAB_CI in Travis. Would it make sense to set it? It looks
as if this variable is currently used only to skip certain tests.

If not, then maybe split it into QEMU_CI, GITLAB_CI and TRAVIS_CI?

https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg04438.html

[PATCH v4 2/3] accel/tcg: Add debuginfo support

2023-01-12 Thread Ilya Leoshkevich

Add libdw-based functions for loading and querying debuginfo. Load
debuginfo from the system and the linux-user loaders.

This is useful for the upcoming perf support, which can then put
human-readable guest symbols instead of raw guest PCs into perfmap and
jitdump files.

Signed-off-by: Ilya Leoshkevich 
---
 accel/tcg/debuginfo.c  | 96 ++
 accel/tcg/debuginfo.h  | 77 +
 accel/tcg/meson.build  |  1 +
 hw/core/loader.c   |  5 +++
 linux-user/elfload.c   |  3 ++
 linux-user/meson.build |  1 +
 meson.build|  8 
 7 files changed, 191 insertions(+)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h

diff --git a/accel/tcg/debuginfo.c b/accel/tcg/debuginfo.c
new file mode 100644
index 000..71c66d04d12
--- /dev/null
+++ b/accel/tcg/debuginfo.c
@@ -0,0 +1,96 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/lockable.h"
+
+#include 
+
+#include "debuginfo.h"
+
+static QemuMutex lock;
+static Dwfl *dwfl;
+static const Dwfl_Callbacks dwfl_callbacks = {
+.find_elf = NULL,
+.find_debuginfo = dwfl_standard_find_debuginfo,
+.section_address = NULL,
+.debuginfo_path = NULL,
+};
+
+__attribute__((constructor))
+static void debuginfo_init(void)
+{
+qemu_mutex_init();
+}
+
+void debuginfo_report_elf(const char *name, int fd, uint64_t bias)
+{
+QEMU_LOCK_GUARD();
+
+if (dwfl) {
+dwfl_report_begin_add(dwfl);
+} else {
+dwfl = dwfl_begin(_callbacks);
+}
+
+if (dwfl) {
+dwfl_report_elf(dwfl, name, name, fd, bias, true);
+dwfl_report_end(dwfl, NULL, NULL);
+}
+}
+
+void debuginfo_lock(void)
+{
+qemu_mutex_lock();
+}
+
+void debuginfo_query(struct debuginfo_query *q, size_t n)
+{
+const char *symbol, *file;
+Dwfl_Module *dwfl_module;
+Dwfl_Line *dwfl_line;
+GElf_Off dwfl_offset;
+GElf_Sym dwfl_sym;
+size_t i;
+int line;
+
+if (!dwfl) {
+return;
+}
+
+for (i = 0; i < n; i++) {
+dwfl_module = dwfl_addrmodule(dwfl, q[i].address);
+if (!dwfl_module) {
+continue;
+}
+
+if (q[i].flags & DEBUGINFO_SYMBOL) {
+symbol = dwfl_module_addrinfo(dwfl_module, q[i].address,
+  _offset, _sym,
+  NULL, NULL, NULL);
+if (symbol) {
+q[i].symbol = symbol;
+q[i].offset = dwfl_offset;
+}
+}
+
+if (q[i].flags & DEBUGINFO_LINE) {
+dwfl_line = dwfl_module_getsrc(dwfl_module, q[i].address);
+if (dwfl_line) {
+file = dwfl_lineinfo(dwfl_line, NULL, , 0, NULL, NULL);
+if (file) {
+q[i].file = file;
+q[i].line = line;
+}
+}
+}
+}
+}
+
+void debuginfo_unlock(void)
+{
+qemu_mutex_unlock();
+}
diff --git a/accel/tcg/debuginfo.h b/accel/tcg/debuginfo.h
new file mode 100644
index 000..7542cfe6e07
--- /dev/null
+++ b/accel/tcg/debuginfo.h
@@ -0,0 +1,77 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef ACCEL_TCG_DEBUGINFO_H
+#define ACCEL_TCG_DEBUGINFO_H
+
+/*
+ * Debuginfo describing a certain address.
+ */
+struct debuginfo_query {
+uint64_t address;/* Input: address. */
+int flags;   /* Input: debuginfo subset. */
+const char *symbol;  /* Symbol that the address is part of. */
+uint64_t offset; /* Offset from the symbol. */
+const char *file;/* Source file associated with the address. */
+int line;/* Line number in the source file. */
+};
+
+/*
+ * Debuginfo subsets.
+ */
+#define DEBUGINFO_SYMBOL BIT(1)
+#define DEBUGINFO_LINE   BIT(2)
+
+#if defined(CONFIG_TCG) && defined(CONFIG_LIBDW)
+/*
+ * Load debuginfo for the specified guest ELF image.
+ * Return true on success, false on failure.
+ */
+void debuginfo_report_elf(const char *name, int fd, uint64_t bias);
+
+/*
+ * Take the debuginfo lock.
+ */
+void debuginfo_lock(void);
+
+/*
+ * Fill each on N Qs with the debuginfo about Q->ADDRESS as specified by
+ * Q->FLAGS:
+ *
+ * - DEBUGINFO_SYMBOL: update Q->SYMBOL and Q->OFFSET. If symbol debuginfo is
+ * missing, then leave them as is.
+ * - DEBUINFO_LINE: update Q->FILE and Q->LINE. If line debuginfo is missing,
+ *  then leave them as is.
+ *
+ * This function must be called under the debuginfo lock. The results can be
+ * accessed only until the debuginfo lock is released.
+ */
+void debuginfo_query(struct debuginfo_query *q, size_t n);
+
+/*
+ * Release the debuginfo lock.
+ */
+void debuginfo_unlock(void);
+#else
+static in

[PATCH v4 0/3] tcg: add perfmap and jitdump

2023-01-12 Thread Ilya Leoshkevich

v3:
https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg02072.html

v3 -> v4:
* s/unsigned long long/uint64_t/g (Richard).
* Fix address resolution with TARGET_TB_PCREL again.
  * Open question: do we need something like get_pc_from_opc()?
See FIXME in patch 3.

v2:
https://lists.gnu.org/archive/html/qemu-devel/2022-11/msg02385.html
https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg01026.html

v2 -> v3:
* Enable only for CONFIG_LINUX (Alex).
* Use qemu_get_thread_id() instead of gettid() (Alex).
* Fix CI (Alex).
  https://gitlab.com/iii-i/qemu/-/pipelines/743684604
* Drop unnecessary #includes (Alex).
* Drop the constification change (Alex/Richard).
* Split debuginfo support into a separate patch.
* Fix partial perfmap/jitdump files when terminating due to a signal.
* Fix debuginfo strings being accessed outside of debuginfo lock.
* Fix address resolution with TARGET_TB_PCREL.
* Add DEBUGINFOD_URLS= to the doc; without it perf inject is
  unacceptably slow.
* Note: it's better to test this with the latest perf
  (6.2.rc3.g7dd4b804e080 worked fine for me). There has been at least
  one breakage in the JIT area recently (fixed by 6d518ac7be62).

v1:
https://lists.nongnu.org/archive/html/qemu-devel/2022-10/msg01824.html
https://lists.nongnu.org/archive/html/qemu-devel/2022-11/msg01073.html

v1 -> v2:
* Use QEMU_LOCK_GUARD (Alex).
* Handle TARGET_TB_PCREL (Alex).
* Support ELF -kernels, add a note about this (Alex). Tested with
  qemu-system-x86_64 and Linux kernel - it's not fast, but it works.
* Minor const correctness and style improvements.

Ilya Leoshkevich (3):
  linux-user: Clean up when exiting due to a signal
  accel/tcg: Add debuginfo support
  tcg: add perfmap and jitdump

 accel/tcg/debuginfo.c |  96 ++
 accel/tcg/debuginfo.h |  77 
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 375 ++
 accel/tcg/perf.h  |  49 +
 accel/tcg/translate-all.c |   7 +
 docs/devel/tcg.rst|  23 +++
 hw/core/loader.c  |   5 +
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 linux-user/signal.c   |   8 +-
 meson.build   |   8 +
 qemu-options.hx   |  20 ++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 17 files changed, 701 insertions(+), 3 deletions(-)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

-- 
2.39.0

[PATCH v4 3/3] tcg: add perfmap and jitdump

2023-01-12 Thread Ilya Leoshkevich

Add ability to dump /tmp/perf-.map and jit-.dump.
The first one allows the perf tool to map samples to each individual
translation block. The second one adds the ability to resolve symbol
names, line numbers and inspect JITed code.

Example of use:

perf record qemu-x86_64 -perfmap ./a.out
perf report

or

perf record -k 1 qemu-x86_64 -jitdump ./a.out
DEBUGINFOD_URLS= perf inject -j -i perf.data -o perf.data.jitted
perf report -i perf.data.jitted

Co-developed-by: Vanderson M. do Rosario 
Co-developed-by: Alex Bennée 
Signed-off-by: Ilya Leoshkevich 
---
 accel/tcg/meson.build |   1 +
 accel/tcg/perf.c  | 375 ++
 accel/tcg/perf.h  |  49 +
 accel/tcg/translate-all.c |   7 +
 docs/devel/tcg.rst|  23 +++
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 qemu-options.hx   |  20 ++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 10 files changed, 505 insertions(+)
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index 55b3b4dd7e3..77740b1a0d7 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -13,6 +13,7 @@ tcg_ss.add(when: 'CONFIG_USER_ONLY', if_true: 
files('user-exec.c'))
 tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
 tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c')])
 tcg_ss.add(when: libdw, if_true: files('debuginfo.c'))
+tcg_ss.add(when: 'CONFIG_LINUX', if_true: files('perf.c'))
 specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 
 specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
new file mode 100644
index 000..ae19f6e28fc
--- /dev/null
+++ b/accel/tcg/perf.c
@@ -0,0 +1,375 @@
+/*
+ * Linux perf perf-.map and jit-.dump integration.
+ *
+ * The jitdump spec can be found at [1].
+ *
+ * [1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "elf.h"
+#include "exec/exec-all.h"
+#include "qemu/timer.h"
+#include "tcg/tcg.h"
+
+#include "debuginfo.h"
+#include "perf.h"
+
+static FILE *safe_fopen_w(const char *path)
+{
+int saved_errno;
+FILE *f;
+int fd;
+
+/* Delete the old file, if any. */
+unlink(path);
+
+/* Avoid symlink attacks by using O_CREAT | O_EXCL. */
+fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+if (fd == -1) {
+return NULL;
+}
+
+/* Convert fd to FILE*. */
+f = fdopen(fd, "w");
+if (f == NULL) {
+saved_errno = errno;
+close(fd);
+errno = saved_errno;
+return NULL;
+}
+
+return f;
+}
+
+static FILE *perfmap;
+
+void perf_enable_perfmap(void)
+{
+char map_file[32];
+
+snprintf(map_file, sizeof(map_file), "/tmp/perf-%d.map", getpid());
+perfmap = safe_fopen_w(map_file);
+if (perfmap == NULL) {
+warn_report("Could not open %s: %s, proceeding without perfmap",
+map_file, strerror(errno));
+}
+}
+
+/* Get PC and size of code JITed for guest instruction #INSN. */
+static void get_host_pc_size(uintptr_t *host_pc, uint16_t *host_size,
+ const void *start, size_t insn)
+{
+uint16_t start_off = insn ? tcg_ctx->gen_insn_end_off[insn - 1] : 0;
+
+if (host_pc) {
+*host_pc = (uintptr_t)start + start_off;
+}
+if (host_size) {
+*host_size = tcg_ctx->gen_insn_end_off[insn] - start_off;
+}
+}
+
+static const char *pretty_symbol(const struct debuginfo_query *q, size_t *len)
+{
+static __thread char buf[64];
+int tmp;
+
+if (!q->symbol) {
+tmp = snprintf(buf, sizeof(buf), "guest-0x%"PRIx64, q->address);
+if (len) {
+*len = MIN(tmp + 1, sizeof(buf));
+}
+return buf;
+}
+
+if (!q->offset) {
+if (len) {
+*len = strlen(q->symbol) + 1;
+}
+return q->symbol;
+}
+
+tmp = snprintf(buf, sizeof(buf), "%s+0x%"PRIx64, q->symbol, q->offset);
+if (len) {
+*len = MIN(tmp + 1, sizeof(buf));
+}
+return buf;
+}
+
+static void write_perfmap_entry(const void *start, size_t insn,
+const struct debuginfo_query *q)
+{
+uint16_t host_size;
+uintptr_t host_pc;
+
+get_host_pc_size(_pc, _size, start, insn);
+fprintf(perfmap, "%"PRIxPTR" %"PRIx16" %s\n",
+host_pc, host_size, pretty_symbol(q, NULL));
+}
+
+static FILE *jitdump;
+
+#define JITHEADER_MAGIC 0x4A695444
+#define JITHEADER_VERSION 1
+
+struct jitheader {
+uint32_t

[PATCH v4 1/3] linux-user: Clean up when exiting due to a signal

2023-01-12 Thread Ilya Leoshkevich

When exiting due to an exit() syscall, qemu-user calls
preexit_cleanup(), but this is currently not the case when exiting due
to a signal. This leads to various buffers not being flushed (e.g.,
for gprof, for gcov, and for the upcoming perf support).

Add the missing call.

Signed-off-by: Ilya Leoshkevich 
Reviewed-by: Alex Bennée 
Reviewed-by: Richard Henderson 
---
 linux-user/signal.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/linux-user/signal.c b/linux-user/signal.c
index 61c6fa3fcf1..098f3a787db 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -695,7 +695,7 @@ void cpu_loop_exit_sigbus(CPUState *cpu, target_ulong addr,
 
 /* abort execution with signal */
 static G_NORETURN
-void dump_core_and_abort(int target_sig)
+void dump_core_and_abort(CPUArchState *cpu_env, int target_sig)
 {
 CPUState *cpu = thread_cpu;
 CPUArchState *env = cpu->env_ptr;
@@ -724,6 +724,8 @@ void dump_core_and_abort(int target_sig)
 target_sig, strsignal(host_sig), "core dumped" );
 }
 
+preexit_cleanup(cpu_env, 128 + target_sig);
+
 /* The proper exit code for dying from an uncaught signal is
  * -.  The kernel doesn't allow exit() or _exit() to pass
  * a negative value.  To get the proper exit code we need to
@@ -1058,12 +1060,12 @@ static void handle_pending_signal(CPUArchState 
*cpu_env, int sig,
sig != TARGET_SIGURG &&
sig != TARGET_SIGWINCH &&
sig != TARGET_SIGCONT) {
-dump_core_and_abort(sig);
+dump_core_and_abort(cpu_env, sig);
 }
 } else if (handler == TARGET_SIG_IGN) {
 /* ignore sig */
 } else if (handler == TARGET_SIG_ERR) {
-dump_core_and_abort(sig);
+dump_core_and_abort(cpu_env, sig);
 } else {
 /* compute the blocked signals during the handler execution */
 sigset_t *blocked_set;
-- 
2.39.0

Re: [PATCH v3 3/3] tcg: add perfmap and jitdump

2023-01-11 Thread Ilya Leoshkevich

On Wed, 2023-01-11 at 02:47 +0100, Ilya Leoshkevich wrote:
> Add ability to dump /tmp/perf-.map and jit-.dump.
> The first one allows the perf tool to map samples to each individual
> translation block. The second one adds the ability to resolve symbol
> names, line numbers and inspect JITed code.
> 
> Example of use:
> 
>     perf record qemu-x86_64 -perfmap ./a.out
>     perf report
> 
> or
> 
>     perf record -k 1 qemu-x86_64 -jitdump ./a.out
>     DEBUGINFOD_URLS= perf inject -j -i perf.data -o perf.data.jitted
>     perf report -i perf.data.jitted
> 
> Co-developed-by: Vanderson M. do Rosario 
> Co-developed-by: Alex Bennée 
> Signed-off-by: Ilya Leoshkevich 
> ---
>  accel/tcg/meson.build |   1 +
>  accel/tcg/perf.c  | 366
> ++
>  accel/tcg/perf.h  |  49 +
>  accel/tcg/translate-all.c |   8 +
>  docs/devel/tcg.rst    |  23 +++
>  linux-user/exit.c |   2 +
>  linux-user/main.c |  15 ++
>  qemu-options.hx   |  20 +++
>  softmmu/vl.c  |  11 ++
>  tcg/tcg.c |   2 +
>  10 files changed, 497 insertions(+)
>  create mode 100644 accel/tcg/perf.c
>  create mode 100644 accel/tcg/perf.h

...

> +void perf_report_code(unsigned long long guest_pc, size_t icount,
> +  const void *start, size_t size)
> +{
> +    struct debuginfo_query *q;
> +    size_t insn;
> +
> +    if (!perfmap && !jitdump) {
> +    return;
> +    }
> +
> +    q = g_try_malloc0_n(icount, sizeof(*q));
> +    if (!q) {
> +    return;
> +    }
> +
> +    debuginfo_lock();
> +
> +    /* Query debuginfo for each guest instruction. */
> +    for (insn = 0; insn < icount; insn++) {
> +    q[insn].address = tcg_ctx->gen_insn_data[insn][0] +
> +  (TARGET_TB_PCREL ? guest_pc : 0);

Currently this produces plausibly looking, but actually wrong
addresses. This needs to match restore_state_to_opc(), so at least:

--- a/accel/tcg/perf.c
+++ b/accel/tcg/perf.c
@@ -325,8 +325,10 @@ void perf_report_code(unsigned long long guest_pc,
size_t icount,
 
 /* Query debuginfo for each guest instruction. */
 for (insn = 0; insn < icount; insn++) {
-q[insn].address = tcg_ctx->gen_insn_data[insn][0] +
-  (TARGET_TB_PCREL ? guest_pc : 0);
+q[insn].address = tcg_ctx->gen_insn_data[insn][0];
+if (TARGET_TB_PCREL) {
+q[insn].address |= (guest_pc & TARGET_PAGE_MASK);
+}
 q[insn].flags = DEBUGINFO_SYMBOL | (jitdump ? DEBUGINFO_LINE :
0);
 }
 debuginfo_query(q, icount);

Apparently even with this there are corner cases, e.g. in
x86_restore_state_to_opc() we have:

if (TARGET_TB_PCREL) {
env->eip = (env->eip & TARGET_PAGE_MASK) | data[0];
} else {
env->eip = data[0] - tb->cs_base;
}

so if tb->cs_base != 0, the result is still going to be wrong.

I wonder if it would make sense to create a new TCGCPUOps member
purely for resolving a PC from data[0]?

[PATCH v3 3/3] tcg: add perfmap and jitdump

2023-01-10 Thread Ilya Leoshkevich

Add ability to dump /tmp/perf-.map and jit-.dump.
The first one allows the perf tool to map samples to each individual
translation block. The second one adds the ability to resolve symbol
names, line numbers and inspect JITed code.

Example of use:

perf record qemu-x86_64 -perfmap ./a.out
perf report

or

perf record -k 1 qemu-x86_64 -jitdump ./a.out
DEBUGINFOD_URLS= perf inject -j -i perf.data -o perf.data.jitted
perf report -i perf.data.jitted

Co-developed-by: Vanderson M. do Rosario 
Co-developed-by: Alex Bennée 
Signed-off-by: Ilya Leoshkevich 
---
 accel/tcg/meson.build |   1 +
 accel/tcg/perf.c  | 366 ++
 accel/tcg/perf.h  |  49 +
 accel/tcg/translate-all.c |   8 +
 docs/devel/tcg.rst|  23 +++
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 10 files changed, 497 insertions(+)
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

diff --git a/accel/tcg/meson.build b/accel/tcg/meson.build
index 55b3b4dd7e3..77740b1a0d7 100644
--- a/accel/tcg/meson.build
+++ b/accel/tcg/meson.build
@@ -13,6 +13,7 @@ tcg_ss.add(when: 'CONFIG_USER_ONLY', if_true: 
files('user-exec.c'))
 tcg_ss.add(when: 'CONFIG_SOFTMMU', if_false: files('user-exec-stub.c'))
 tcg_ss.add(when: 'CONFIG_PLUGIN', if_true: [files('plugin-gen.c')])
 tcg_ss.add(when: libdw, if_true: files('debuginfo.c'))
+tcg_ss.add(when: 'CONFIG_LINUX', if_true: files('perf.c'))
 specific_ss.add_all(when: 'CONFIG_TCG', if_true: tcg_ss)
 
 specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: files(
diff --git a/accel/tcg/perf.c b/accel/tcg/perf.c
new file mode 100644
index 000..427ccbe80e1
--- /dev/null
+++ b/accel/tcg/perf.c
@@ -0,0 +1,366 @@
+/*
+ * Linux perf perf-.map and jit-.dump integration.
+ *
+ * The jitdump spec can be found at [1].
+ *
+ * [1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "elf.h"
+#include "qemu/timer.h"
+#include "tcg/tcg.h"
+
+#include "debuginfo.h"
+#include "perf.h"
+
+static FILE *safe_fopen_w(const char *path)
+{
+int saved_errno;
+FILE *f;
+int fd;
+
+/* Delete the old file, if any. */
+unlink(path);
+
+/* Avoid symlink attacks by using O_CREAT | O_EXCL. */
+fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+if (fd == -1) {
+return NULL;
+}
+
+/* Convert fd to FILE*. */
+f = fdopen(fd, "w");
+if (f == NULL) {
+saved_errno = errno;
+close(fd);
+errno = saved_errno;
+return NULL;
+}
+
+return f;
+}
+
+static FILE *perfmap;
+
+void perf_enable_perfmap(void)
+{
+char map_file[32];
+
+snprintf(map_file, sizeof(map_file), "/tmp/perf-%d.map", getpid());
+perfmap = safe_fopen_w(map_file);
+if (perfmap == NULL) {
+warn_report("Could not open %s: %s, proceeding without perfmap",
+map_file, strerror(errno));
+}
+}
+
+/* Get PC and size of code JITed for guest instruction #INSN. */
+static void get_host_pc_size(uintptr_t *host_pc, uint16_t *host_size,
+ const void *start, size_t insn)
+{
+uint16_t start_off = insn ? tcg_ctx->gen_insn_end_off[insn - 1] : 0;
+
+if (host_pc) {
+*host_pc = (uintptr_t)start + start_off;
+}
+if (host_size) {
+*host_size = tcg_ctx->gen_insn_end_off[insn] - start_off;
+}
+}
+
+static const char *pretty_symbol(const struct debuginfo_query *q, size_t *len)
+{
+static __thread char buf[64];
+int tmp;
+
+if (!q->symbol) {
+tmp = snprintf(buf, sizeof(buf), "guest-0x%llx", q->address);
+if (len) {
+*len = MIN(tmp + 1, sizeof(buf));
+}
+return buf;
+}
+
+if (!q->offset) {
+if (len) {
+*len = strlen(q->symbol) + 1;
+}
+return q->symbol;
+}
+
+tmp = snprintf(buf, sizeof(buf), "%s+0x%llx", q->symbol, q->offset);
+if (len) {
+*len = MIN(tmp + 1, sizeof(buf));
+}
+return buf;
+}
+
+static void write_perfmap_entry(const void *start, size_t insn,
+const struct debuginfo_query *q)
+{
+uint16_t host_size;
+uintptr_t host_pc;
+
+get_host_pc_size(_pc, _size, start, insn);
+fprintf(perfmap, "%"PRIxPTR" %"PRIx16" %s\n",
+host_pc, host_size, pretty_symbol(q, NULL));
+}
+
+static FILE *jitdump;
+
+#define JITHEADER_MAGIC 0x4A695444
+#define JITHEADER_VERSION 1
+
+struct jitheader {
+uint32_t magic;
+uint32_t version;
+uint32_t to

[PATCH v3 2/3] accel/tcg: Add debuginfo support

2023-01-10 Thread Ilya Leoshkevich

Add libdw-based functions for loading and querying debuginfo. Load
debuginfo from the system and the linux-user loaders.

This is useful for the upcoming perf support, which can then put
human-readable guest symbols instead of raw guest PCs into perfmap and
jitdump files.

Signed-off-by: Ilya Leoshkevich 
---
 accel/tcg/debuginfo.c  | 96 ++
 accel/tcg/debuginfo.h  | 77 +
 accel/tcg/meson.build  |  1 +
 hw/core/loader.c   |  5 +++
 linux-user/elfload.c   |  3 ++
 linux-user/meson.build |  1 +
 meson.build|  8 
 7 files changed, 191 insertions(+)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h

diff --git a/accel/tcg/debuginfo.c b/accel/tcg/debuginfo.c
new file mode 100644
index 000..fee98a8e867
--- /dev/null
+++ b/accel/tcg/debuginfo.c
@@ -0,0 +1,96 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/lockable.h"
+
+#include 
+
+#include "debuginfo.h"
+
+static QemuMutex lock;
+static Dwfl *dwfl;
+static const Dwfl_Callbacks dwfl_callbacks = {
+.find_elf = NULL,
+.find_debuginfo = dwfl_standard_find_debuginfo,
+.section_address = NULL,
+.debuginfo_path = NULL,
+};
+
+__attribute__((constructor))
+static void debuginfo_init(void)
+{
+qemu_mutex_init();
+}
+
+void debuginfo_report_elf(const char *name, int fd, unsigned long long bias)
+{
+QEMU_LOCK_GUARD();
+
+if (dwfl) {
+dwfl_report_begin_add(dwfl);
+} else {
+dwfl = dwfl_begin(_callbacks);
+}
+
+if (dwfl) {
+dwfl_report_elf(dwfl, name, name, fd, bias, true);
+dwfl_report_end(dwfl, NULL, NULL);
+}
+}
+
+void debuginfo_lock(void)
+{
+qemu_mutex_lock();
+}
+
+void debuginfo_query(struct debuginfo_query *q, size_t n)
+{
+const char *symbol, *file;
+Dwfl_Module *dwfl_module;
+Dwfl_Line *dwfl_line;
+GElf_Off dwfl_offset;
+GElf_Sym dwfl_sym;
+size_t i;
+int line;
+
+if (!dwfl) {
+return;
+}
+
+for (i = 0; i < n; i++) {
+dwfl_module = dwfl_addrmodule(dwfl, q[i].address);
+if (!dwfl_module) {
+continue;
+}
+
+if (q[i].flags & DEBUGINFO_SYMBOL) {
+symbol = dwfl_module_addrinfo(dwfl_module, q[i].address,
+  _offset, _sym,
+  NULL, NULL, NULL);
+if (symbol) {
+q[i].symbol = symbol;
+q[i].offset = dwfl_offset;
+}
+}
+
+if (q[i].flags & DEBUGINFO_LINE) {
+dwfl_line = dwfl_module_getsrc(dwfl_module, q[i].address);
+if (dwfl_line) {
+file = dwfl_lineinfo(dwfl_line, NULL, , 0, NULL, NULL);
+if (file) {
+q[i].file = file;
+q[i].line = line;
+}
+}
+}
+}
+}
+
+void debuginfo_unlock(void)
+{
+qemu_mutex_unlock();
+}
diff --git a/accel/tcg/debuginfo.h b/accel/tcg/debuginfo.h
new file mode 100644
index 000..50b7a0f8471
--- /dev/null
+++ b/accel/tcg/debuginfo.h
@@ -0,0 +1,77 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef ACCEL_TCG_DEBUGINFO_H
+#define ACCEL_TCG_DEBUGINFO_H
+
+/*
+ * Debuginfo describing a certain address.
+ */
+struct debuginfo_query {
+unsigned long long address;  /* Input: address. */
+int flags;   /* Input: debuginfo subset. */
+const char *symbol;  /* Symbol that the address is part of. */
+unsigned long long offset;   /* Offset from the symbol. */
+const char *file;/* Source file associated with the address. */
+int line;/* Line number in the source file. */
+};
+
+/*
+ * Debuginfo subsets.
+ */
+#define DEBUGINFO_SYMBOL BIT(1)
+#define DEBUGINFO_LINE   BIT(2)
+
+#if defined(CONFIG_TCG) && defined(CONFIG_LIBDW)
+/*
+ * Load debuginfo for the specified guest ELF image.
+ * Return true on success, false on failure.
+ */
+void debuginfo_report_elf(const char *name, int fd, unsigned long long bias);
+
+/*
+ * Take the debuginfo lock.
+ */
+void debuginfo_lock(void);
+
+/*
+ * Fill each on N Qs with the debuginfo about Q->ADDRESS as specified by
+ * Q->FLAGS:
+ *
+ * - DEBUGINFO_SYMBOL: update Q->SYMBOL and Q->OFFSET. If symbol debuginfo is
+ * missing, then leave them as is.
+ * - DEBUINFO_LINE: update Q->FILE and Q->LINE. If line debuginfo is missing,
+ *  then leave them as is.
+ *
+ * This function must be called under the debuginfo lock. The results can be
+ * accessed only until the debuginfo lock is released.
+ */
+void debuginfo_query(struct debuginfo_query *q, size_t n);
+
+/*
+ * Release the debuginfo lock.
+ */
+void debu

[PATCH v3 1/3] linux-user: Clean up when exiting due to a signal

2023-01-10 Thread Ilya Leoshkevich

When exiting due to an exit() syscall, qemu-user calls
preexit_cleanup(), but this is currently not the case when exiting due
to a signal. This leads to various buffers not being flushed (e.g.,
for gprof, for gcov, and for the upcoming perf support).

Add the missing call.

Signed-off-by: Ilya Leoshkevich 
---
 linux-user/signal.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/linux-user/signal.c b/linux-user/signal.c
index 61c6fa3fcf1..098f3a787db 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -695,7 +695,7 @@ void cpu_loop_exit_sigbus(CPUState *cpu, target_ulong addr,
 
 /* abort execution with signal */
 static G_NORETURN
-void dump_core_and_abort(int target_sig)
+void dump_core_and_abort(CPUArchState *cpu_env, int target_sig)
 {
 CPUState *cpu = thread_cpu;
 CPUArchState *env = cpu->env_ptr;
@@ -724,6 +724,8 @@ void dump_core_and_abort(int target_sig)
 target_sig, strsignal(host_sig), "core dumped" );
 }
 
+preexit_cleanup(cpu_env, 128 + target_sig);
+
 /* The proper exit code for dying from an uncaught signal is
  * -.  The kernel doesn't allow exit() or _exit() to pass
  * a negative value.  To get the proper exit code we need to
@@ -1058,12 +1060,12 @@ static void handle_pending_signal(CPUArchState 
*cpu_env, int sig,
sig != TARGET_SIGURG &&
sig != TARGET_SIGWINCH &&
sig != TARGET_SIGCONT) {
-dump_core_and_abort(sig);
+dump_core_and_abort(cpu_env, sig);
 }
 } else if (handler == TARGET_SIG_IGN) {
 /* ignore sig */
 } else if (handler == TARGET_SIG_ERR) {
-dump_core_and_abort(sig);
+dump_core_and_abort(cpu_env, sig);
 } else {
 /* compute the blocked signals during the handler execution */
 sigset_t *blocked_set;
-- 
2.39.0

[PATCH v3 0/3] tcg: add perfmap and jitdump

2023-01-10 Thread Ilya Leoshkevich

v2:
https://lists.gnu.org/archive/html/qemu-devel/2022-11/msg02385.html
https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg01026.html

v2 -> v3:
* Enable only for CONFIG_LINUX (Alex).
* Use qemu_get_thread_id() instead of gettid() (Alex).
* Fix CI (Alex).
  https://gitlab.com/iii-i/qemu/-/pipelines/743684604
* Drop unnecessary #includes (Alex).
* Drop the constification change (Alex/Richard).
* Split debuginfo support into a separate patch.
* Fix partial perfmap/jitdump files when terminating due to a signal.
* Fix debuginfo strings being accessed outside of debuginfo lock.
* Fix address resolution with TARGET_TB_PCREL.
* Add DEBUGINFOD_URLS= to the doc; without it perf inject is
  unacceptably slow.
* Note: it's better to test this with the latest perf
  (6.2.rc3.g7dd4b804e080 worked fine for me). There has been at least
  one breakage in the JIT area recently (fixed by 6d518ac7be62).

v1:
https://lists.nongnu.org/archive/html/qemu-devel/2022-10/msg01824.html
https://lists.nongnu.org/archive/html/qemu-devel/2022-11/msg01073.html

v1 -> v2:
* Use QEMU_LOCK_GUARD (Alex).
* Handle TARGET_TB_PCREL (Alex).
* Support ELF -kernels, add a note about this (Alex). Tested with
  qemu-system-x86_64 and Linux kernel - it's not fast, but it works.
* Minor const correctness and style improvements.

Ilya Leoshkevich (3):
  linux-user: Clean up when exiting due to a signal
  accel/tcg: Add debuginfo support
  tcg: add perfmap and jitdump

 accel/tcg/debuginfo.c |  96 ++
 accel/tcg/debuginfo.h |  77 
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 366 ++
 accel/tcg/perf.h  |  49 +
 accel/tcg/translate-all.c |   8 +
 docs/devel/tcg.rst|  23 +++
 hw/core/loader.c  |   5 +
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 linux-user/signal.c   |   8 +-
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/tcg.c |   2 +
 17 files changed, 693 insertions(+), 3 deletions(-)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

-- 
2.39.0

PING: [PATCH v2 0/1] tcg: add perfmap and jitdump

2023-01-03 Thread Ilya Leoshkevich

On Mon, 2022-11-14 at 17:13 +0100, Ilya Leoshkevich wrote:
> v1:
> https://lists.nongnu.org/archive/html/qemu-devel/2022-10/msg01824.html
> https://lists.nongnu.org/archive/html/qemu-devel/2022-11/msg01073.html
> 
> v1 -> v2:
> * Use QEMU_LOCK_GUARD (Alex).
> * Handle TARGET_TB_PCREL (Alex).
> * Support ELF -kernels, add a note about this (Alex). Tested with
>   qemu-system-x86_64 and Linux kernel - it's not fast, but it works.
> * Minor const correctness and style improvements.
> 
> Ilya Leoshkevich (1):
>   tcg: add perfmap and jitdump
> 
>  accel/tcg/debuginfo.c |  99 +++
>  accel/tcg/debuginfo.h |  52 ++
>  accel/tcg/meson.build |   2 +
>  accel/tcg/perf.c  | 334
> ++
>  accel/tcg/perf.h  |  28 
>  accel/tcg/translate-all.c |   8 +
>  docs/devel/tcg.rst    |  23 +++
>  hw/core/loader.c  |   5 +
>  include/tcg/tcg.h |   4 +-
>  linux-user/elfload.c  |   3 +
>  linux-user/exit.c |   2 +
>  linux-user/main.c |  15 ++
>  linux-user/meson.build    |   1 +
>  meson.build   |   8 +
>  qemu-options.hx   |  20 +++
>  softmmu/vl.c  |  11 ++
>  tcg/region.c  |   2 +-
>  tcg/tcg.c |   2 +
>  18 files changed, 616 insertions(+), 3 deletions(-)
>  create mode 100644 accel/tcg/debuginfo.c
>  create mode 100644 accel/tcg/debuginfo.h
>  create mode 100644 accel/tcg/perf.c
>  create mode 100644 accel/tcg/perf.h
> 

Hi,

I would like to ping this patch:

https://lists.gnu.org/archive/html/qemu-devel/2022-11/msg02385.html
https://patchew.org/QEMU/20221114161321.3364875-1-...@linux.ibm.com/

Best regards,
Ilya

Re: [PULL v2 07/14] accel/tcg: Use interval tree for user-only page tracking

2022-12-23 Thread Ilya Leoshkevich

On Tue, Dec 20, 2022 at 09:03:06PM -0800, Richard Henderson wrote:
> Finish weaning user-only away from PageDesc.
> 
> Using an interval tree to track page permissions means that
> we can represent very large regions efficiently.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/290
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/967
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1214
> Reviewed-by: Alex Bennée 
> Signed-off-by: Richard Henderson 
> ---
>  accel/tcg/internal.h   |   4 +-
>  accel/tcg/tb-maint.c   |  20 +-
>  accel/tcg/user-exec.c  | 615 ++---
>  tests/tcg/multiarch/test-vma.c |  22 ++
>  4 files changed, 451 insertions(+), 210 deletions(-)
>  create mode 100644 tests/tcg/multiarch/test-vma.c

Hi,

After staring at vma-pthread.c failures for some time, I finally
spotted a few lines here that look suspicious.



>  int page_get_flags(target_ulong address)
>  {
> -PageDesc *p;
> +PageFlagsNode *p = pageflags_find(address, address);
>  
> -p = page_find(address >> TARGET_PAGE_BITS);
> -if (!p) {
> +/*
> + * See util/interval-tree.c re lockless lookups: no false positives but
> + * there are false negatives.  If we find nothing, retry with the mmap
> + * lock acquired.
> + */
> +if (p) {
> +return p->flags;
> +}
> +if (have_mmap_lock()) {
>  return 0;
>  }
> -return p->flags;
> +
> +mmap_lock();
> +p = pageflags_find(address, address);
> +mmap_unlock();

How does the code ensure that p is not freed here?

> +return p ? p->flags : 0;
> +}



>  int page_check_range(target_ulong start, target_ulong len, int flags)
>  {
> -PageDesc *p;
> -target_ulong end;
> -target_ulong addr;
> -
> -/*
> - * This function should never be called with addresses outside the
> - * guest address space.  If this assert fires, it probably indicates
> - * a missing call to h2g_valid.
> - */
> -if (TARGET_ABI_BITS > L1_MAP_ADDR_SPACE_BITS) {
> -assert(start < ((target_ulong)1 << L1_MAP_ADDR_SPACE_BITS));
> -}
> +target_ulong last;
>  
>  if (len == 0) {
> -return 0;
> -}
> -if (start + len - 1 < start) {
> -/* We've wrapped around.  */
> -return -1;
> +return 0;  /* trivial length */
>  }
>  
> -/* must do before we loose bits in the next step */
> -end = TARGET_PAGE_ALIGN(start + len);
> -start = start & TARGET_PAGE_MASK;
> +last = start + len - 1;
> +if (last < start) {
> +return -1; /* wrap around */
> +}
> +
> +while (true) {
> +PageFlagsNode *p = pageflags_find(start, last);

We can end up here without mmap_lock if we come from the syscall code.
Do we need a retry like in page_get_flags()?
Or would it make sense to just take mmap_lock in lock_user()?

Speaking of which: does lock_user() actually guarantee that it's safe
to access the respective pages until unlock_user()? If yes, doesn't
this mean that mmap_lock must be held between the two? And if no, and
the SEGV handler is already supposed to gracefully handle SEGVs in
syscall.c, do we need to call access_ok_untagged() there at all?

> +int missing;
>  
> -for (addr = start, len = end - start;
> - len != 0;
> - len -= TARGET_PAGE_SIZE, addr += TARGET_PAGE_SIZE) {
> -p = page_find(addr >> TARGET_PAGE_BITS);
>  if (!p) {
> -return -1;
> +return -1; /* entire region invalid */
>  }
> -if (!(p->flags & PAGE_VALID)) {
> -return -1;
> +if (start < p->itree.start) {
> +return -1; /* initial bytes invalid */
>  }
>  
> -if ((flags & PAGE_READ) && !(p->flags & PAGE_READ)) {
> -return -1;
> +missing = flags & ~p->flags;
> +if (missing & PAGE_READ) {
> +return -1; /* page not readable */
>  }
> -if (flags & PAGE_WRITE) {
> +if (missing & PAGE_WRITE) {
>  if (!(p->flags & PAGE_WRITE_ORG)) {
> +return -1; /* page not writable */
> +}
> +/* Asking about writable, but has been protected: undo. */
> +if (!page_unprotect(start, 0)) {
>  return -1;
>  }
> -/* unprotect the page if it was put read-only because it
> -   contains translated code */
> -if (!(p->flags & PAGE_WRITE)) {
> -if (!page_unprotect(addr, 0)) {
> -return -1;
> -}
> +/* TODO: page_unprotect should take a range, not a single page. 
> */
> +if (last - start < TARGET_PAGE_SIZE) {
> +return 0; /* ok */
>  }
> +start += TARGET_PAGE_SIZE;
> +continue;
>  }
> +
> +if (last <= p->itree.last) {
> +return 0; /* ok */
> +}
> +

Re: [PATCH] tests/tcg/multiarch: add vma-pthread.c

2022-12-23 Thread Ilya Leoshkevich

On Fri, 2022-12-23 at 13:02 +0100, Ilya Leoshkevich wrote:
> Add a test that locklessly changes and exercises page protection bits
> from various threads. This helps catch race conditions in the VMA
> handling.
> 
> Signed-off-by: Ilya Leoshkevich 
> ---
>  tests/tcg/multiarch/Makefile.target  |   3 +
>  tests/tcg/multiarch/munmap-pthread.c |  16 +--
>  tests/tcg/multiarch/nop_func.h   |  25 
>  tests/tcg/multiarch/vma-pthread.c    | 185
> +++
>  4 files changed, 214 insertions(+), 15 deletions(-)
>  create mode 100644 tests/tcg/multiarch/nop_func.h
>  create mode 100644 tests/tcg/multiarch/vma-pthread.c

This was meant to be a reply to the bug report for [1], but apparently
I forgot to Cc: the mailing list. Copying the original message here:

---
Hi,

Wasmtime testsuite started failing randomly, complaining that
clock_gettime() returns -EFAULT. Bisect points to this commit.

I could not see anything obviously wrong here with the manual review,
and the failure was not reproducible when running individual testcases
or using strace. So I wrote a stress test (which I will post shortly),
which runs fine on the host, but reproduces the issue with qemu-user.

When run with -strace, it also triggers an assertion:

qemu-x86_64: ../accel/tcg/tb-maint.c:595:
tb_invalidate_phys_page_unwind: Assertion `pc != 0' failed.
qemu-x86_64: /home/iii/qemu/include/qemu/rcu.h:102:
rcu_read_unlock: Assertion `p_rcu_reader->depth != 0' failed.

I haven't tried analyzing what is causing all this yet, but at least
now the reproducer is small (~200LOC) and fails faster than 1s.

Best regards,
Ilya
---

[1] https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg03615.html

[PATCH] tests/tcg/multiarch: add vma-pthread.c

2022-12-23 Thread Ilya Leoshkevich

Add a test that locklessly changes and exercises page protection bits
from various threads. This helps catch race conditions in the VMA
handling.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/multiarch/Makefile.target  |   3 +
 tests/tcg/multiarch/munmap-pthread.c |  16 +--
 tests/tcg/multiarch/nop_func.h   |  25 
 tests/tcg/multiarch/vma-pthread.c| 185 +++
 4 files changed, 214 insertions(+), 15 deletions(-)
 create mode 100644 tests/tcg/multiarch/nop_func.h
 create mode 100644 tests/tcg/multiarch/vma-pthread.c

diff --git a/tests/tcg/multiarch/Makefile.target 
b/tests/tcg/multiarch/Makefile.target
index 5f0fee1aadb..e7213af4925 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -39,6 +39,9 @@ signals: LDFLAGS+=-lrt -lpthread
 munmap-pthread: CFLAGS+=-pthread
 munmap-pthread: LDFLAGS+=-pthread
 
+vma-pthread: CFLAGS+=-pthread
+vma-pthread: LDFLAGS+=-pthread
+
 # We define the runner for test-mmap after the individual
 # architectures have defined their supported pages sizes. If no
 # additional page sizes are defined we only run the default test.
diff --git a/tests/tcg/multiarch/munmap-pthread.c 
b/tests/tcg/multiarch/munmap-pthread.c
index d7143b00d5f..1c79005846d 100644
--- a/tests/tcg/multiarch/munmap-pthread.c
+++ b/tests/tcg/multiarch/munmap-pthread.c
@@ -7,21 +7,7 @@
 #include 
 #include 
 
-static const char nop_func[] = {
-#if defined(__aarch64__)
-0xc0, 0x03, 0x5f, 0xd6, /* ret */
-#elif defined(__alpha__)
-0x01, 0x80, 0xFA, 0x6B, /* ret */
-#elif defined(__arm__)
-0x1e, 0xff, 0x2f, 0xe1, /* bx lr */
-#elif defined(__riscv)
-0x67, 0x80, 0x00, 0x00, /* ret */
-#elif defined(__s390__)
-0x07, 0xfe, /* br %r14 */
-#elif defined(__i386__) || defined(__x86_64__)
-0xc3,   /* ret */
-#endif
-};
+#include "nop_func.h"
 
 static void *thread_mmap_munmap(void *arg)
 {
diff --git a/tests/tcg/multiarch/nop_func.h b/tests/tcg/multiarch/nop_func.h
new file mode 100644
index 000..f714d21
--- /dev/null
+++ b/tests/tcg/multiarch/nop_func.h
@@ -0,0 +1,25 @@
+/*
+ * No-op functions that can be safely copied.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef NOP_FUNC_H
+#define NOP_FUNC_H
+
+static const char nop_func[] = {
+#if defined(__aarch64__)
+0xc0, 0x03, 0x5f, 0xd6, /* ret */
+#elif defined(__alpha__)
+0x01, 0x80, 0xFA, 0x6B, /* ret */
+#elif defined(__arm__)
+0x1e, 0xff, 0x2f, 0xe1, /* bx lr */
+#elif defined(__riscv)
+0x67, 0x80, 0x00, 0x00, /* ret */
+#elif defined(__s390__)
+0x07, 0xfe, /* br %r14 */
+#elif defined(__i386__) || defined(__x86_64__)
+0xc3,   /* ret */
+#endif
+};
+
+#endif
diff --git a/tests/tcg/multiarch/vma-pthread.c 
b/tests/tcg/multiarch/vma-pthread.c
new file mode 100644
index 000..c405cd46329
--- /dev/null
+++ b/tests/tcg/multiarch/vma-pthread.c
@@ -0,0 +1,185 @@
+/*
+ * Test that VMA updates do not race.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Map a contiguous chunk of RWX memory. Split it into 8 equally sized
+ * regions, each of which is guaranteed to have a certain combination of
+ * protection bits set.
+ *
+ * Reader, writer and executor threads perform the respective operations on
+ * pages, which are guaranteed to have the respective protection bit set.
+ * Two mutator threads change the non-fixed protection bits randomly.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "nop_func.h"
+
+#define PAGE_IDX_BITS 10
+#define PAGE_COUNT (1 << PAGE_IDX_BITS)
+#define PAGE_IDX_MASK (PAGE_COUNT - 1)
+#define REGION_IDX_BITS 3
+#define PAGE_IDX_R_MASK (1 << 7)
+#define PAGE_IDX_W_MASK (1 << 8)
+#define PAGE_IDX_X_MASK (1 << 9)
+#define REGION_MASK (PAGE_IDX_R_MASK | PAGE_IDX_W_MASK | PAGE_IDX_X_MASK)
+#define PAGES_PER_REGION (1 << (PAGE_IDX_BITS - REGION_IDX_BITS))
+
+struct context {
+int pagesize;
+char *ptr;
+int dev_null_fd;
+volatile int mutator_count;
+};
+
+static void *thread_read(void *arg)
+{
+struct context *ctx = arg;
+ssize_t sret;
+size_t i, j;
+int ret;
+
+for (i = 0; ctx->mutator_count; i++) {
+j = (i & PAGE_IDX_MASK) | PAGE_IDX_R_MASK;
+/* Read directly. */
+ret = memcmp(>ptr[j * ctx->pagesize], nop_func, sizeof(nop_func));
+assert(ret == 0);
+/* Read indirectly. */
+sret = write(ctx->dev_null_fd, >ptr[j * ctx->pagesize], 1);
+assert(sret == 1);
+}
+
+return NULL;
+}
+
+static void *thread_write(void *arg)
+{
+struct context *ctx = arg;
+struct timespec *ts;
+size_t i, j;
+int ret;
+
+for (i = 0; ctx->mutator_count; i++) {
+j = (i & PAGE_IDX_MASK) | PAGE_IDX_W_MASK;
+/* Write directly. */
+memcpy(>ptr[

[PATCH RFC v2 0/1] tests: add wasmtime testsuite

2022-12-21 Thread Ilya Leoshkevich

Hi,

I made some updates based on the feedback from Alex.

At the moment it mostly works for me on top of 6394578984da: aarch64,
riscv64 and s390x are clean, but there are some failures on x86_64.
With qemu-user vma rework it unfortunately fails in more places;
I haven't analyzed these failures yet.

v1: https://lists.gnu.org/archive/html/qemu-devel/2022-07/msg00657.html
https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg02612.html

v1 -> v2:
* Drop gitlab jobs.
* Move to tests/third-party/.
* Use avocado. To be honest, I'm not sure how much value it brings
  here; I hoped that TAPRunner would at least print the number of
  passed tests, but it only checks whether at least one test failed).
* Add various check-* Makefile targets.
* Add json -> TAP 14 conversion.
* Add documentation.
* Move test binaries to the host system.
  This prevents unnecessary full rebuilds of the Docker image.
* Add riscv64; bump Rust and Wasmtime versions.
* Do not use docker.py; unfortunately this leads to reimplementing some
  of its features: docker command detection based on $(ENGINE) and
  injecting the current user into the image.
* Disable core dumps.

Best regards,
Ilya

Ilya Leoshkevich (1):
  tests: add wasmtime testsuite

 Makefile |  1 +
 docs/devel/testing.rst   |  9 +++
 tests/Makefile.include   |  6 ++
 tests/third-party/Makefile.include   | 50 
 tests/third-party/wasmtime/Dockerfile| 32 
 tests/third-party/wasmtime/Makefile.include  | 49 
 tests/third-party/wasmtime/avocado-wrapper   | 38 +
 tests/third-party/wasmtime/avocado.cfg   |  3 +
 tests/third-party/wasmtime/json2tap  | 77 ++
 tests/third-party/wasmtime/run-tests-wrapper | 82 
 tests/third-party/wasmtime/test  | 48 
 11 files changed, 395 insertions(+)
 create mode 100644 tests/third-party/Makefile.include
 create mode 100644 tests/third-party/wasmtime/Dockerfile
 create mode 100644 tests/third-party/wasmtime/Makefile.include
 create mode 100755 tests/third-party/wasmtime/avocado-wrapper
 create mode 100644 tests/third-party/wasmtime/avocado.cfg
 create mode 100755 tests/third-party/wasmtime/json2tap
 create mode 100755 tests/third-party/wasmtime/run-tests-wrapper
 create mode 100755 tests/third-party/wasmtime/test

-- 
2.38.1

[PATCH RFC v2 1/1] tests: add wasmtime testsuite

2022-12-21 Thread Ilya Leoshkevich

wasmtime is a WebAssembly runtime, which includes a large testsuite.
This testsuite uses qemu-user (aarch64, riscv64, s390x and x86_64 are
supported) in order to exercise foreign architectures. Over time it
found several regressions in qemu itself, and it would be beneficial to
catch the similar ones earlier.

To this end, this patch introduces Makefile targets that run stable
wasmtime testsuite against qemu-{aarch64,riscv64,s390x,x86_64}.

Signed-off-by: Ilya Leoshkevich 
---
 Makefile |  1 +
 docs/devel/testing.rst   |  9 +++
 tests/Makefile.include   |  6 ++
 tests/third-party/Makefile.include   | 50 
 tests/third-party/wasmtime/Dockerfile| 32 
 tests/third-party/wasmtime/Makefile.include  | 49 
 tests/third-party/wasmtime/avocado-wrapper   | 38 +
 tests/third-party/wasmtime/avocado.cfg   |  3 +
 tests/third-party/wasmtime/json2tap  | 77 ++
 tests/third-party/wasmtime/run-tests-wrapper | 82 
 tests/third-party/wasmtime/test  | 48 
 11 files changed, 395 insertions(+)
 create mode 100644 tests/third-party/Makefile.include
 create mode 100644 tests/third-party/wasmtime/Dockerfile
 create mode 100644 tests/third-party/wasmtime/Makefile.include
 create mode 100755 tests/third-party/wasmtime/avocado-wrapper
 create mode 100644 tests/third-party/wasmtime/avocado.cfg
 create mode 100755 tests/third-party/wasmtime/json2tap
 create mode 100755 tests/third-party/wasmtime/run-tests-wrapper
 create mode 100755 tests/third-party/wasmtime/test

diff --git a/Makefile b/Makefile
index a48103cc8a1..3a5250d3e46 100644
--- a/Makefile
+++ b/Makefile
@@ -287,6 +287,7 @@ export DESTDIR
 include $(SRC_PATH)/tests/lcitool/Makefile.include
 include $(SRC_PATH)/tests/docker/Makefile.include
 include $(SRC_PATH)/tests/vm/Makefile.include
+include $(SRC_PATH)/tests/third-party/Makefile.include
 
 print-help-run = printf "  %-30s - %s\\n" "$1" "$2"
 print-help = @$(call print-help-run,$1,$2)
diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index e10c47b5a7c..0cbbcae4bcd 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -1475,3 +1475,12 @@ coverage-html`` which will create
 Further analysis can be conducted by running the ``gcov`` command
 directly on the various .gcda output files. Please read the ``gcov``
 documentation for more information.
+
+Third-party testsuites
+~~
+
+Various third-party projects contain testsuites that exercise non-trivial
+machine instructions or Linux kernel interfaces. While they may be too big for
+the gating CI, developers can use the corresponding make targets to verify
+their changes. The list of these targets can be found in the "Third-party
+testing targets" section of ``make check-help`` output.
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 9422ddaece5..506fe5ab16a 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -22,6 +22,12 @@ endif
@echo " $(MAKE) check-venv Creates a Python venv for tests"
@echo " $(MAKE) check-cleanClean the tests and related data"
@echo
+   @echo "Third-party testing targets:"
+   @echo " $(MAKE) check-third-party Run third-party tests for all 
supported targets"
+   @echo " $(MAKE) check-third-party-TARGET  Run third-party tests for 
given target"
+   @echo " $(MAKE) check-wasmtimeRun wasmtime tests for all 
supported targets"
+   @echo " $(MAKE) check-wasmtime-TARGET Run wasmtime tests for given 
target"
+   @echo
@echo "The following are useful for CI builds"
@echo " $(MAKE) check-buildBuild most test binaries"
@echo " $(MAKE) get-vm-images  Downloads all images used by 
avocado tests, according to configured targets (~350 MB each, 1.5 GB max)"
diff --git a/tests/third-party/Makefile.include 
b/tests/third-party/Makefile.include
new file mode 100644
index 000..2d45d7c4260
--- /dev/null
+++ b/tests/third-party/Makefile.include
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# Makefile for testing QEMU with third-party testsuites.
+
+# Known docker-like commands.
+DOCKER_COMMAND_PODMAN = podman
+DOCKER_COMMAND_DOCKER = docker
+DOCKER_COMMAND_SUDO_DOCKER = sudo -n docker
+DOCKER_COMMAND_NONE = /bin/false
+
+# Mapping from $(ENGINE) to the respective docker-like commands.
+DOCKER_COMMAND_CANDIDATES_auto = PODMAN DOCKER SUDO_DOCKER
+DOCKER_COMMAND_CANDIDATES_podman = PODMAN
+DOCKER_COMMAND_CANDIDATES_docker = DOCKER SUDO_DOCKER
+
+# Select the docker-like command based on $(ENGINE).
+DOCKER_COMMAND = $(DOCKER_COMMAND_NONE)
+define try_docker_command
+ifeq ($(DOCKER_COMMAND),$(DOCKER_COMM

Re: [RFC] gitlab: introduce s390x wasmtime job

2022-12-19 Thread Ilya Leoshkevich

On Fri, 2022-12-16 at 15:10 +, Alex Bennée wrote:
> 
> Ilya Leoshkevich  writes:
> 
> > On Tue, 2022-07-05 at 15:40 +0100, Peter Maydell wrote:
> > > On Tue, 5 Jul 2022 at 15:37, Ilya Leoshkevich 
> > > wrote:
> > > > 
> > > > On Tue, 2022-07-05 at 14:57 +0100, Peter Maydell wrote:
> > > > > On Tue, 5 Jul 2022 at 14:04, Daniel P. Berrangé
> > > > > 
> > > > > wrote:
> > > > > > If we put this job in QEMU CI someone will have to be able
> > > > > > to
> > > > > > interpret the results when it fails.
> > > > > 
> > > > > In particular since this is qemu-user, the answer is probably
> > > > > at least some of the time going to be "oh, well, qemu-user
> > > > > isn't
> > > > > reliable
> > > > > if you do complicated things in the guest". I'd be pretty
> > > > > wary of
> > > > > our
> > > > > having
> > > > > a "pass a big complicated guest code test suite under linux-
> > > > > user
> > > > > mode"
> > > > > in the CI path.
> > > 
> > > > Actually exercising qemu-user is one of the goals here: just as
> > > > an
> > > > example, one of the things that the test suite found was commit
> > > > 9a12adc704f9 ("linux-user/s390x: Fix unwinding from signal
> > > > handlers"),
> > > > so it's not only about the ISA.
> > > > 
> > > > At least for s390x, we've noticed that various projects use
> > > > qemu-user-based setups in their CI (either calling it
> > > > explicitly,
> > > > or
> > > > via binfmt-misc), and we would like these workflows to be
> > > > reliable,
> > > > even if they try complicated (within reason) things there.
> > > 
> > > I also would like them to be reliable. But I don't think
> > > *testing* these things is the difficulty: it is having
> > > people who are willing to spend time on the often quite
> > > difficult tasks of identifying why something intermittently
> > > fails and doing the necessary design and implementation work
> > > to correct the problem. Sometimes this is easy (as in the
> > > s390 regression above) but quite often it is not (eg when
> > > multiple threads are in use, or the guest wants to do
> > > something complicated with clone(), etc).
> > > 
> > > thanks
> > > -- PMM
> > > 
> > 
> > For what it's worth, we can help analyzing and fixing failures
> > detected
> > by the s390x wasmtime job. If something breaks, we will have to
> > look at
> > it anyway, and it's better to do this sooner than later.
> 
> Sorry for necroing an old thread but I just wanted to add my 2p.

Thanks for that though; I've been cherry-picking this patch into my
private trees for some time now, and would be happy to see it go
upstream in some form.

> I think making 3rd party test suites easily available to developers
> is a worthy
> goal and there are a number that I would like to see including LTP
> and
> kvm-unit-tests. As others have pointed out I'm less sure about adding
> it
> to the gating CI.

Another third-party test suite that I found useful was the valgrind's
one. I'll post my thoughts about integrating wasmtime's and valgrind's
test suites below, unfortunately I'm not too familiar with LTP and
kvm-unit-tests.

Not touching the gating CI is fine for me.

> If we want to go forward with this we should probably think about how
> we
> would approach this generally:
> 
>   - tests/third-party-suites/FOO?

Sounds good to me.

>   - should we use avocado as a wrapper or something else?
>     - make check-?

avocado sounds good; we might have to add a second wrapper for
producing tap output (see below).

One should definitely be able to specify the testsuite and the
architecture, e.g. `make check-third-party-wasmtime-s390x`.

In addition, we need to either hardcode or let the user choose
the way the testsuite it built and executed. I see 3 possibilities:

- Fully on the host. Easiest to implement, the results are also easy
  to debug. But this requires installing cross-toolchains manually,
  which is simple on some distros and not-so-simple on the others.

- Provide the toolchain as a Docker image. For wasmtime, the toolchain
  would include the Rust compiler and Cargo. This solves the problem
  with configuring the host, but introduces the next choice one has to
  make:

  - Build qemu on the host. Then qemu binary would have to be

Re: [PATCH v2 27/27] target/s390x: Enable TARGET_TB_PCREL

2022-12-14 Thread Ilya Leoshkevich

On Sun, Dec 11, 2022 at 09:28:02AM -0600, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/cpu-param.h |  1 +
>  target/s390x/cpu.c   | 12 +
>  target/s390x/tcg/translate.c | 86 +++-
>  3 files changed, 68 insertions(+), 31 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v2] linux-user: Add emulation for MADV_WIPEONFORK and MADV_KEEPONFORK in madvise()

2022-12-13 Thread Ilya Leoshkevich

On Tue, Dec 13, 2022 at 06:03:09PM +0100, Helge Deller wrote:
> Both parameters have a different value on the parisc platform, so first
> translate the target value into a host value for usage in the native
> madvise() syscall.
> 
> Those parameters are often used by security sensitive applications (e.g.
> tor browser, boringssl, ...) which expect the call to return a proper
> return code on failure, so return -EINVAL if qemu fails to forward the
> syscall to the host OS.
> 
> While touching this code, enhance the comments about MADV_DONTNEED.
> 
> Tested with testcase of tor browser when running hppa-linux guest on
> x86-64 host.
> 
> Signed-off-by: Helge Deller 
> 
> ---
> v2: based on feedback from Ilya Leoshkevich 
> - rename can_passthrough_madv_dontneed() to can_passthrough_madvise()
> - rephrase the comment about MADV_DONTNEED

Thanks!

Acked-by: Ilya Leoshkevich

Re: [PATCH v2 19/27] target/s390x: Introduce help_goto_indirect

2022-12-13 Thread Ilya Leoshkevich

On Sun, Dec 11, 2022 at 09:27:54AM -0600, Richard Henderson wrote:
> Add a small helper to handle unconditional indirect jumps.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 19 ++-
>  1 file changed, 10 insertions(+), 9 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v2 14/27] target/s390x: Assert masking of psw.addr in cpu_get_tb_cpu_state

2022-12-13 Thread Ilya Leoshkevich

On Sun, Dec 11, 2022 at 09:27:49AM -0600, Richard Henderson wrote:
> When changing modes via SAM, we raise a specification exception if the
> new PC is out of range.  The masking in s390x_tr_init_disas_context
> was too late to be correct, but may be removed.  Add a debugging
> assert in cpu_get_tb_cpu_state.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/cpu.h   | 20 ++--
>  target/s390x/tcg/translate.c |  6 +-
>  2 files changed, 15 insertions(+), 11 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 25/27] tcg/s390x: Tighten constraints for 64-bit compare

2022-12-13 Thread Ilya Leoshkevich

On Tue, Dec 13, 2022 at 10:43:07AM -0600, Richard Henderson wrote:
> On 12/13/22 10:25, Ilya Leoshkevich wrote:
> > On Thu, Dec 08, 2022 at 08:05:28PM -0600, Richard Henderson wrote:
> > > Give 64-bit comparison second operand a signed 33-bit immediate.
> > > This is the smallest superset of uint32_t and int32_t, as used
> > > by CLGFI and CGFI respectively.  The rest of the 33-bit space
> > > can be loaded into TCG_TMP0.  Drop use of the constant pool.
> > > 
> > > Signed-off-by: Richard Henderson 
> > > ---
> > >   tcg/s390x/tcg-target-con-set.h |  3 +++
> > >   tcg/s390x/tcg-target.c.inc | 27 ++-
> > >   2 files changed, 17 insertions(+), 13 deletions(-)
> > 
> > <...>
> > > --- a/tcg/s390x/tcg-target.c.inc
> > > +++ b/tcg/s390x/tcg-target.c.inc
> > > @@ -1249,22 +1249,20 @@ static int tgen_cmp2(TCGContext *s, TCGType type, 
> > > TCGCond c, TCGReg r1,
> > >   tcg_out_insn_RIL(s, op, r1, c2);
> > >   goto exit;
> > >   }
> > > +
> > > +/*
> > > + * Constraints are for a signed 33-bit operand, which is a
> > > + * convenient superset of this signed/unsigned test.
> > > + */
> > >   if (c2 == (is_unsigned ? (TCGArg)(uint32_t)c2 : 
> > > (TCGArg)(int32_t)c2)) {
> > >   op = (is_unsigned ? RIL_CLGFI : RIL_CGFI);
> > >   tcg_out_insn_RIL(s, op, r1, c2);
> > >   goto exit;
> > >   }
> > > -/* Use the constant pool, but not for small constants.  */
> > > -if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) {
> > > -c2 = TCG_TMP0;
> > > -/* fall through to reg-reg */
> > > -} else {
> > > -op = (is_unsigned ? RIL_CLGRL : RIL_CGRL);
> > > -tcg_out_insn_RIL(s, op, r1, 0);
> > > -new_pool_label(s, c2, R_390_PC32DBL, s->code_ptr - 2, 2);
> > > -goto exit;
> > > -}
> > > +/* Load everything else into a register. */
> > > +tcg_out_movi(s, TCG_TYPE_I64, TCG_TMP0, c2);
> > > +c2 = TCG_TMP0;
> > 
> > What does tightening the constraint give us, if we have to handle the
> > "everything else" case anyway, even for values that match
> > TCG_CT_CONST_S33?
> 
> Values outside const_s33 get loaded by the register allocator, which means
> the value in the register might get re-used.

Thanks for the explanation!
I did not consider the reuse of already loaded large 64-bit values.

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 00/27] tcg/s390x: misc patches

2022-12-13 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:03PM -0600, Richard Henderson wrote:
> Based-on: 20221202053958.223890-1-richard.hender...@linaro.org
> ("[PATCH for-8.0 v3 00/34] tcg misc patches")
> 
> Changes from v3:
>   * Require z196 as minimum cpu -- 6 new patches removing checks.
>   * Tighten constraints on AND, OR, XOR, CMP, trying get the register
> allocator to hoist things that can't be done in a single insn.
>   * Avoid the constant pool for movi.
> 
> I believe that I have addressed all of the discussion in v3,
> except perhaps for goto_tb concurrent modifications to jumps.
> I'm still not quite sure what to do about that.

I asked around, and apparently some other JITs (e.g. Java and .NET) are
doing atomic branch offset patching (provided the offset is aligned,
which QEMU does already ensure) on s390x for a long time now, and
without known issues. So I'm okay with keeping this code as is.

<...>

Best regards,
Ilya

Re: [PATCH v4 27/27] tcg/s390x: Avoid the constant pool in tcg_out_movi

2022-12-13 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:30PM -0600, Richard Henderson wrote:
> Load constants in no more than two insns, which turns
> out to be faster than using the constant pool.
> 
> Suggested-by: Ilya Leoshkevich 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 23 +--
>  1 file changed, 17 insertions(+), 6 deletions(-)

Thanks!

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 26/27] tcg/s390x: Cleanup tcg_out_movi

2022-12-13 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:29PM -0600, Richard Henderson wrote:
> Merge maybe_out_small_movi, as it no longer has additional users.
> Use is_const_p{16,32}.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 52 --
>  1 file changed, 16 insertions(+), 36 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 25/27] tcg/s390x: Tighten constraints for 64-bit compare

2022-12-13 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:28PM -0600, Richard Henderson wrote:
> Give 64-bit comparison second operand a signed 33-bit immediate.
> This is the smallest superset of uint32_t and int32_t, as used
> by CLGFI and CGFI respectively.  The rest of the 33-bit space
> can be loaded into TCG_TMP0.  Drop use of the constant pool.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  3 +++
>  tcg/s390x/tcg-target.c.inc | 27 ++-
>  2 files changed, 17 insertions(+), 13 deletions(-)

<...>
 
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -1249,22 +1249,20 @@ static int tgen_cmp2(TCGContext *s, TCGType type, 
> TCGCond c, TCGReg r1,
>  tcg_out_insn_RIL(s, op, r1, c2);
>  goto exit;
>  }
> +
> +/*
> + * Constraints are for a signed 33-bit operand, which is a
> + * convenient superset of this signed/unsigned test.
> + */
>  if (c2 == (is_unsigned ? (TCGArg)(uint32_t)c2 : 
> (TCGArg)(int32_t)c2)) {
>  op = (is_unsigned ? RIL_CLGFI : RIL_CGFI);
>  tcg_out_insn_RIL(s, op, r1, c2);
>  goto exit;
>  }
>  
> -/* Use the constant pool, but not for small constants.  */
> -if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) {
> -c2 = TCG_TMP0;
> -/* fall through to reg-reg */
> -} else {
> -op = (is_unsigned ? RIL_CLGRL : RIL_CGRL);
> -tcg_out_insn_RIL(s, op, r1, 0);
> -new_pool_label(s, c2, R_390_PC32DBL, s->code_ptr - 2, 2);
> -goto exit;
> -}
> +/* Load everything else into a register. */
> +tcg_out_movi(s, TCG_TYPE_I64, TCG_TMP0, c2);
> +c2 = TCG_TMP0;

What does tightening the constraint give us, if we have to handle the
"everything else" case anyway, even for values that match
TCG_CT_CONST_S33?

The example that I have in mind is:

- Comparison: r0_64 s<= -0xL;
- tcg_target_const_match(-0xL, TCG_TYPE_I64,
 TCG_CT_CONST_S33) == true;
- (long)(int)-0xL != -0x;
- So we should end up in the "everything else" branch.

<...>

Best regards,
Ilya

Re: [PATCH v4 18/27] tcg/s390x: Tighten constraints for and_i64

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:21PM -0600, Richard Henderson wrote:
> Let the register allocator handle such immediates by matching
> only what one insn can achieve.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 17/27] tcg/s390x: Tighten constraints for or_i64 and xor_i64

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:20PM -0600, Richard Henderson wrote:
> Drop support for sequential OR and XOR, as the serial dependency is
> slower than loading the constant first.  Let the register allocator
> handle such immediates by matching only what one insn can achieve.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 16/27] tcg/s390x: Issue XILF directly for xor_i32

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:19PM -0600, Richard Henderson wrote:
> There is only one instruction that is applicable
> to a 32-bit immediate xor.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 10/27] tcg/s390x: Remove DISTINCT_OPERANDS facility check

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:13PM -0600, Richard Henderson wrote:
> The distinct-operands facility is bundled into facility 45,
> along with load-on-condition.  We are checking this at startup.
> Remove the a0 == a1 checks for 64-bit sub, and, or, xor, as there
> is no space savings for avoiding the distinct-operands insn.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 08/27] tcg/s390x: Check for load-on-condition facility at startup

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:11PM -0600, Richard Henderson wrote:
> The general-instruction-extension facility was introduced in z196,
> which itself was end-of-life in 2021.  In addition, z196 is the
> minimum CPU supported by our set of supported operating systems:
> RHEL 7 (z196), SLES 12 (z196) and Ubuntu 16.04 (zEC12).
> 
> Check for facility number 45, which will be the consilidated check
> for several facilities.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 07/27] tcg/s390x: Check for general-instruction-extension facility at startup

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:10PM -0600, Richard Henderson wrote:
> The general-instruction-extension facility was introduced in z10,
> which itself was end-of-life in 2019.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |  10 ++--
>  tcg/s390x/tcg-target.c.inc | 100 -
>  2 files changed, 49 insertions(+), 61 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 06/27] tcg/s390x: Check for extended-immediate facility at startup

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:09PM -0600, Richard Henderson wrote:
> The extended-immediate facility was introduced in z9-109,
> which itself was end-of-life in 2017.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |   4 +-
>  tcg/s390x/tcg-target.c.inc | 231 +++--
>  2 files changed, 72 insertions(+), 163 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH] linux-user: Add emulation for MADV_WIPEONFORK and MADV_KEEPONFORK in madvise()

2022-12-12 Thread Ilya Leoshkevich

On Mon, Dec 12, 2022 at 10:49:24PM +0100, Helge Deller wrote:
> On 12/12/22 22:16, Ilya Leoshkevich wrote:
> > On Mon, Dec 12, 2022 at 08:00:45AM +0100, Helge Deller wrote:
> > > Both parameters have a different value on the parisc platform, so first
> > > translate the target value into a host value for usage in the native
> > > madvise() syscall.
> > > 
> > > Those parameters are often used by security sensitive applications (e.g.
> > > tor browser, boringssl, ...) which expect the call to return a proper
> > > return code on failure, so return -EINVAL if qemu fails to forward the
> > > syscall to the host OS.
> > > 
> > > Tested with testcase of tor browser when running hppa-linux guest on
> > > x86-64 host.
> > > 
> > > Signed-off-by: Helge Deller 
> > > 
> > > diff --git a/linux-user/mmap.c b/linux-user/mmap.c
> > > index 10f5079331..c75342108c 100644
> > > --- a/linux-user/mmap.c
> > > +++ b/linux-user/mmap.c
> > > @@ -901,11 +901,25 @@ abi_long target_madvise(abi_ulong start, abi_ulong 
> > > len_in, int advice)
> > >   return -TARGET_EINVAL;
> > >   }
> > > 
> > > +/* Translate for some architectures which have different MADV_xxx 
> > > values */
> > > +switch (advice) {
> > > +case TARGET_MADV_DONTNEED:  /* alpha */
> > > +advice = MADV_DONTNEED;
> > > +break;
> > > +case TARGET_MADV_WIPEONFORK:/* parisc */
> > > +advice = MADV_WIPEONFORK;
> > > +break;
> > > +case TARGET_MADV_KEEPONFORK:/* parisc */
> > > +advice = MADV_KEEPONFORK;
> > > +break;
> > > +/* we do not care about the other MADV_xxx values yet */
> > > +}
> > > +
> > >   /*
> > >* A straight passthrough may not be safe because qemu sometimes 
> > > turns
> > >* private file-backed mappings into anonymous mappings.
> > >*
> > > - * This is a hint, so ignoring and returning success is ok.
> > > + * For MADV_DONTNEED, which is a hint, ignoring and returning 
> > > success is ok.
> > 
> > Actually, MADV_DONTNEED is one of the few values, which is not always a
> > hint - it can be used to e.g. zero out pages.
> 
> Right, it _should_ zero out pages and return 0, or otherwise return failure.
> I think the problem is that some userspace apps will then sadly break if we
> change the current behaviour
> 
> Anyway, in this patch I didn't wanted to touch MAD_DONTNEED.
> 
> > As the next paragraph states, strictly speaking, MADV_DONTNEED is
> > currently broken, because it can indeed be ignored without indication
> > in some cases, but it's still arguably better than not honoring it at
> > all.
> 
> Yep.
> 
> > >*
> > >* This breaks MADV_DONTNEED, completely implementing which is quite
> > >* complicated. However, there is one low-hanging fruit: mappings 
> > > that are
> > > @@ -913,11 +927,17 @@ abi_long target_madvise(abi_ulong start, abi_ulong 
> > > len_in, int advice)
> > >* passthrough is safe, so do it.
> > >*/
> > >   mmap_lock();
> > > -if (advice == TARGET_MADV_DONTNEED &&
> > > -can_passthrough_madv_dontneed(start, end)) {
> > > -ret = get_errno(madvise(g2h_untagged(start), len, 
> > > MADV_DONTNEED));
> > > -if (ret == 0) {
> > > -page_reset_target_data(start, start + len);
> > > +switch (advice) {
> > > +case MADV_WIPEONFORK:
> > > +case MADV_KEEPONFORK:
> > > +ret = -EINVAL;
> > > +/* fall through */
> > > +case MADV_DONTNEED:
> > > +if (can_passthrough_madv_dontneed(start, end)) {
> > > +ret = get_errno(madvise(g2h_untagged(start), len, advice));
> > > +if ((advice == MADV_DONTNEED) && (ret == 0)) {
> > > +page_reset_target_data(start, start + len);
> > > +}
> > >   }
> > >   }
> > >   mmap_unlock();
> > > 
> > 
> > Nit: maybe rename can_passthrough_madv_dontneed() to can_passthrough(),
> > since now it's used not only for MADV_DONTNEED?
> 
> Maybe can_passthrough_madvise() is better?

Sounds good to me as well. The idea with PAGE_PASSTHROUGH was that we
should be able to passthrough anything; but with this patch we still
use it only for madvise(), and indicating it in the name makes sense.

> > > With the MADV_DONTNEED comment change:
> 
> Just for me to understand correctly:
> You propose that I shouldn't touch that comment in my followup-patch, right?
> That's ok.

Either that, or maybe make it more precise - strictly speaking, it's
not correct at the moment anyway. Maybe something like this?

Most advice values are hints, so ignoring and returning success is
ok.

However, this would break MADV_DONTNEED, MADV_WIPEONFORK and
MADV_KEEPONFORK ...

> > Acked-by: Ilya Leoshkevich 
> 
> Thanks!
> Helge

Re: [PATCH v4 05/27] tcg/s390x: Check for long-displacement facility at startup

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:08PM -0600, Richard Henderson wrote:
> We are already assuming the existance of long-displacement, but were
> not being explicit about it.  This has been present since z990.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |  6 --
>  tcg/s390x/tcg-target.c.inc | 15 +++
>  2 files changed, 19 insertions(+), 2 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 04/27] tcg/s390x: Remove USE_LONG_BRANCHES

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:07PM -0600, Richard Henderson wrote:
> The size of a compiled TB is limited by the uint16_t used by
> gen_insn_end_off[] -- there is no need for a 32-bit branch.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 9 -
>  1 file changed, 9 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v4 03/27] tcg/s390x: Always set TCG_TARGET_HAS_direct_jump

2022-12-12 Thread Ilya Leoshkevich

On Thu, Dec 08, 2022 at 08:05:06PM -0600, Richard Henderson wrote:
> Since USE_REG_TB is removed, there is no need to load the
> target TB address into a register.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |  2 +-
>  tcg/s390x/tcg-target.c.inc | 48 +++---
>  2 files changed, 10 insertions(+), 40 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH] linux-user: Add emulation for MADV_WIPEONFORK and MADV_KEEPONFORK in madvise()

2022-12-12 Thread Ilya Leoshkevich

On Mon, Dec 12, 2022 at 08:00:45AM +0100, Helge Deller wrote:
> Both parameters have a different value on the parisc platform, so first
> translate the target value into a host value for usage in the native
> madvise() syscall.
> 
> Those parameters are often used by security sensitive applications (e.g.
> tor browser, boringssl, ...) which expect the call to return a proper
> return code on failure, so return -EINVAL if qemu fails to forward the
> syscall to the host OS.
> 
> Tested with testcase of tor browser when running hppa-linux guest on
> x86-64 host.
> 
> Signed-off-by: Helge Deller 
> 
> diff --git a/linux-user/mmap.c b/linux-user/mmap.c
> index 10f5079331..c75342108c 100644
> --- a/linux-user/mmap.c
> +++ b/linux-user/mmap.c
> @@ -901,11 +901,25 @@ abi_long target_madvise(abi_ulong start, abi_ulong 
> len_in, int advice)
>  return -TARGET_EINVAL;
>  }
> 
> +/* Translate for some architectures which have different MADV_xxx values 
> */
> +switch (advice) {
> +case TARGET_MADV_DONTNEED:  /* alpha */
> +advice = MADV_DONTNEED;
> +break;
> +case TARGET_MADV_WIPEONFORK:/* parisc */
> +advice = MADV_WIPEONFORK;
> +break;
> +case TARGET_MADV_KEEPONFORK:/* parisc */
> +advice = MADV_KEEPONFORK;
> +break;
> +/* we do not care about the other MADV_xxx values yet */
> +}
> +
>  /*
>   * A straight passthrough may not be safe because qemu sometimes turns
>   * private file-backed mappings into anonymous mappings.
>   *
> - * This is a hint, so ignoring and returning success is ok.
> + * For MADV_DONTNEED, which is a hint, ignoring and returning success is 
> ok.

Actually, MADV_DONTNEED is one of the few values, which is not always a
hint - it can be used to e.g. zero out pages.

As the next paragraph states, strictly speaking, MADV_DONTNEED is
currently broken, because it can indeed be ignored without indication
in some cases, but it's still arguably better than not honoring it at
all.

>   *
>   * This breaks MADV_DONTNEED, completely implementing which is quite
>   * complicated. However, there is one low-hanging fruit: mappings that 
> are
> @@ -913,11 +927,17 @@ abi_long target_madvise(abi_ulong start, abi_ulong 
> len_in, int advice)
>   * passthrough is safe, so do it.
>   */
>  mmap_lock();
> -if (advice == TARGET_MADV_DONTNEED &&
> -can_passthrough_madv_dontneed(start, end)) {
> -ret = get_errno(madvise(g2h_untagged(start), len, MADV_DONTNEED));
> -if (ret == 0) {
> -page_reset_target_data(start, start + len);
> +switch (advice) {
> +case MADV_WIPEONFORK:
> +case MADV_KEEPONFORK:
> +ret = -EINVAL;
> +/* fall through */
> +case MADV_DONTNEED:
> +if (can_passthrough_madv_dontneed(start, end)) {
> +ret = get_errno(madvise(g2h_untagged(start), len, advice));
> +if ((advice == MADV_DONTNEED) && (ret == 0)) {
> +page_reset_target_data(start, start + len);
> +    }
>      }
>  }
>  mmap_unlock();
> 

Nit: maybe rename can_passthrough_madv_dontneed() to can_passthrough(),
since now it's used not only for MADV_DONTNEED?



With the MADV_DONTNEED comment change:

Acked-by: Ilya Leoshkevich

Re: [PATCH v3 02/13] tcg/s390x: Remove TCG_REG_TB

2022-12-08 Thread Ilya Leoshkevich

On Tue, 2022-12-06 at 18:42 -0600, Richard Henderson wrote:
> On 12/6/22 16:22, Richard Henderson wrote:
> > > Wouldn't it be worth keeping XILF/XIFH here?
> > 
> > I don't know.  It's difficult for me to guess whether a dependency
> > chain like
> > 
> >  val -> xor -> xor
> > 
> > (3 insns with serial dependencies) is better than
> > 
> >  val   --> xor
> >  load  -/
> > 
> > (3 insns, but only one serial dependency) is better.  But there may
> > also be instruction 
> > fusion going on at the micro-architectural level, so that there's
> > really only one xor.
> > 
> > If you have suggestions, I'm all ears.
> 
> Related microarchitectural question:
> 
> If a 32-bit insn and a 64-bit insn have a similar size encoding (and
> perhaps even if they 
> don't), is it better to produce a 64-bit output so that the hw
> doesn't have a false 
> dependency on the upper 32-bits of the register?
> 
> Just wondering whether most of the distinction between 32-bit and 64-
> bit opcodes ought to 
> be discarded, simplifying code generation.  The only items that seem
> most likely to have 
> real execution time differences are multiply and divide.

I think this will definitely lead to false dependencies if one produces
32 bits in one place, and then consumes 64 in the other. But if this
idea is applied consistently, then there should be hopefully not so
many such instances?

Two thing come to mind here: memory and CC generation.

The first is probably not very important: we can implement 32-bit loads
with lgf, which sign-extends to 64 bits.

CC generation can be tricker: for conditional jumps it's still going to
be okay, since the code would produce 64-bit values and consume 32-bit
ones, but if the back-end needs a CC from a 32-bit addition, then we
would probably need to sign-extend the result in order to get rid of a
false dependency later on. However, based on a quick inspection I could
not find any such instances.

So using 64-bit operations instead of 32-bit ones would be an
interesting experiment.

Best regards,
Ilya

Re: [PATCH v3 02/13] tcg/s390x: Remove TCG_REG_TB

2022-12-07 Thread Ilya Leoshkevich

On Tue, 2022-12-06 at 16:22 -0600, Richard Henderson wrote:
> On 12/6/22 13:29, Ilya Leoshkevich wrote:
> > On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote:
> > > This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and
> > > several follow-up patches.  The primary motivation is to reduce
> > > the
> > > less-tested code paths, pre-z10.  Secondarily, this allows the
> > > unconditional use of TCG_TARGET_HAS_direct_jump, which might be
> > > more
> > > important for performance than any slight increase in code size.
> > > 
> > > Signed-off-by: Richard Henderson 
> > > ---
> > >   tcg/s390x/tcg-target.h |   2 +-
> > >   tcg/s390x/tcg-target.c.inc | 176 +-----
> > > ---
> > >   2 files changed, 23 insertions(+), 155 deletions(-)
> > 
> > Reviewed-by: Ilya Leoshkevich 
> > 
> > I have a few questions/ideas for the future below.  
> > > 
...

> > 
> > > -    /* Use the constant pool if USE_REG_TB, but not for small
> > > constants.  */
> > > -    if (maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> > > -    if (type == TCG_TYPE_I32) {
> > > -    tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
> > > -    } else {
> > > -    tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
> > > -    }
> > > -    } else if (USE_REG_TB) {
> > > -    tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE,
> > > 0);
> > > -    new_pool_label(s, val, R_390_20, s->code_ptr - 2,
> > > -   tcg_tbrel_diff(s, NULL));
> > > +    tcg_out_movi(s, type, TCG_TMP0, val);
> > > +    if (type == TCG_TYPE_I32) {
> > > +    tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
> > >   } else {
> > > -    /* Perform the xor by parts.  */
> > > -    tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
> > > -    if (val & 0x) {
> > > -    tcg_out_insn(s, RIL, XILF, dest, val);
> > > -    }
> > > -    if (val > 0x) {
> > > -    tcg_out_insn(s, RIL, XIHF, dest, val >> 32);
> > > -    }
> > > +    tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
> > >   }
> > >   }
> > 
> > Wouldn't it be worth keeping XILF/XIFH here?
> 
> I don't know.  It's difficult for me to guess whether a dependency
> chain like
> 
>  val -> xor -> xor
> 
> (3 insns with serial dependencies) is better than
> 
>  val   --> xor
>  load  -/
> 
> (3 insns, but only one serial dependency) is better.  But there may
> also be instruction 
> fusion going on at the micro-architectural level, so that there's
> really only one xor.
> 
> If you have suggestions, I'm all ears.

I ran some experiments, and it turned out you were right: xilf+xihf is
slower exactly because of dependency chains.
So no need to change anything here and sorry for the noise.

> > I don't have any numbers right now, but it looks more
> > compact/efficient
> > than a load + XGR.
> 
> If we assume general-instruction-extension-facility (z10?), LGRL +
> XGR is smaller than 
> XILF + XIFH, ignoring the constant pool entry which might be shared,
> and modulo the µarch 
> questions above.
> 
> 
> > Same for OGR above; I even wonder if both implementations could be
> > unified.
> 
> Sadly not, because of OILL et al.  There are no 16-bit xor immediate
> insns.
> 
> > > +    /*
> > > + * branch displacement must be aligned for atomic
> > > patching;
> > > + * see if we need to add extra nop before branch
> > > + */
> > > +    if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
> > > +    tcg_out16(s, NOP);
> > >   }
> > > +    tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
> > > +    s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
> > > +    tcg_out32(s, 0);
> > >   set_jmp_reset_offset(s, a0);
> > 
> > This seems to work in practice, but I don't think patching branch
> > offsets is allowed by PoP (in a multi-threaded environment). For
> > example, I had to do [2] in order to work around this limitation in
> > ftrace.
> 
> Really?  How does the processor distinguish between overwriting
> opcode/condition vs 
> overwriting immediate operand when invalidating cached instructions?

The problem is that nothing in PoP prevents a CPU from fetching
instruction bytes one by one, in random order and more than one time.
It's not that this is necessarily going to happen, rather, it's just
that this has never been verified for all instructions and formally
stated. The observation that I use in the ftrace patch is not
formalized either, but it's specific to a single instruction and should
hold in practice for the existing hardware to the best of my knowledge.

> If overwriting operand truly isn't correct, then I think we have to
> use indirect branch 
> always for goto_tb.
> 
> > A third benefit seems to be that we now have one more register to
> > allocate.
> 
> Yes.  It's call clobbered, so it isn't live so often, but sometimes.
> 
> 
> r~

Re: [PATCH v3 02/13] tcg/s390x: Remove TCG_REG_TB

2022-12-07 Thread Ilya Leoshkevich

On Wed, 2022-12-07 at 08:55 -0600, Richard Henderson wrote:
> On 12/7/22 01:45, Thomas Huth wrote:
> > On 06/12/2022 23.22, Richard Henderson wrote:
> > > On 12/6/22 13:29, Ilya Leoshkevich wrote:
> > > > This change doesn't seem to affect that, but what is the
> > > > minimum
> > > > supported s390x qemu host? z900?
> > > 
> > > Possibly z990, if I'm reading the gcc processor_flags_table[]
> > > correctly; 
> > > long-displacement-facility is definitely a minimum.
> > > 
> > > We probably should revisit what the minimum for TCG should be,
> > > assert those features at 
> > > startup, and drop the corresponding runtime tests.
> > 
> > If we consider the official IBM support statement:
> > 
> > https://www.ibm.com/support/pages/system/files/inline-files/IBM%20Mainframe%20Life%20Cycle%20History%20V2.10%20-%20Sept%2013%202022_1.pdf
> > 
> > ... that would mean that the z10 and all older machines are not
> > supported anymore.
> 
> Thanks for the pointer.  It would appear that z114 exits support at
> the end of this month, 
> which would leave z12 as minimum supported cpu.
> 
> Even assuming z196 gets us extended-immediate, general-insn-
> extension, load-on-condition, 
> and distinct-operands, which are all quite important to TCG, and
> constitute almost all of 
> the current runtime checks.
> 
> The other metric would be matching the set of supported cpus from the
> set of supported os 
> distributions, but I would be ready to believe z196 is below the
> minimum there too.
> 
> 
> r~

I think it should be safe to raise the minimum required hardware for
TCG to z196:

* The oldest supported RHEL is v7, it requires z196:

https://access.redhat.com/product-life-cycles/
https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/installation_guide/chap-installation-planning-s390

* The oldest supported SLES is v12, it requires z196:

https://www.suse.com/de-de/lifecycle/
https://documentation.suse.com/sles/12-SP5/html/SLES-all/cha-zseries.html

* The oldest supported Ubuntu is v16.04, it requires zEC12+:
https://wiki.ubuntu.com/S390X

Best regards,
Ilya

Re: [PATCH v3 13/13] tcg/s390x: Implement ctpop operation

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:52:00PM -0800, Richard Henderson wrote:
> There is an older form that produces per-byte results,
> and a newer form that produces per-register results,
> and a vector form that produces per-element results.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |  5 ++--
>  tcg/s390x/tcg-target.c.inc | 51 ++
>  2 files changed, 54 insertions(+), 2 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 12/13] tcg/s390x: Use tgen_movcond_int in tgen_clz

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:59PM -0800, Richard Henderson wrote:
> Reuse code from movcond to conditionally copy a2 to dest,
> based on the condition codes produced by FLOGR.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  1 +
>  tcg/s390x/tcg-target.c.inc | 26 +++---
>  2 files changed, 12 insertions(+), 15 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 11/13] tcg/s390x: Support SELGR instruction in movcond

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:58PM -0800, Richard Henderson wrote:
> The new select instruction provides two separate register inputs,
> whereas the old load-on-condition instruction overlaps one of the
> register inputs with the destination.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 21 +
>  1 file changed, 21 insertions(+)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 10/13] tcg/s390x: Generalize movcond implementation

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:57PM -0800, Richard Henderson wrote:
> Generalize movcond to support pre-computed conditions, and the same
> set of arguments at all times.  This will be assumed by a following
> patch, which needs to reuse tgen_movcond_int.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  3 +-
>  tcg/s390x/tcg-target.c.inc | 78 ++
>  2 files changed, 61 insertions(+), 20 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 09/13] tcg/s390x: Create tgen_cmp2 to simplify movcond

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:56PM -0800, Richard Henderson wrote:
> Return both regular and inverted condition codes from tgen_cmp2.
> This lets us choose after the fact which comparision we want.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 08/13] tcg/s390x: Support MIE3 logical operations

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:55PM -0800, Richard Henderson wrote:
> This is andc, orc, nand, nor, eqv.
> We can use nor for implementing not.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |   1 +
>  tcg/s390x/tcg-target.h |  25 +
>  tcg/s390x/tcg-target.c.inc | 100 +
>  3 files changed, 114 insertions(+), 12 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 07/13] tcg/s390x: Support MIE2 MGRK instruction

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:54PM -0800, Richard Henderson wrote:
> The MIE2 facility adds a 3-operand signed 64x64->128 multiply.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h | 1 +
>  tcg/s390x/tcg-target.h | 2 +-
>  tcg/s390x/tcg-target.c.inc | 8 
>  3 files changed, 10 insertions(+), 1 deletion(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 06/13] tcg/s390x: Support MIE2 multiply single instructions

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:53PM -0800, Richard Henderson wrote:
> The MIE2 facility adds 3-operand versions of multiply.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  1 +
>  tcg/s390x/tcg-target.h |  1 +
>  tcg/s390x/tcg-target.c.inc | 34 --
>  3 files changed, 26 insertions(+), 10 deletions(-)

Reviewed-by: Ilya Leoshkevich 

I have one small suggestion, see below.

> diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
> index 00ba727b70..33a82e3286 100644
> --- a/tcg/s390x/tcg-target-con-set.h
> +++ b/tcg/s390x/tcg-target-con-set.h
> @@ -23,6 +23,7 @@ C_O1_I2(r, 0, ri)
>  C_O1_I2(r, 0, rI)
>  C_O1_I2(r, 0, rJ)
>  C_O1_I2(r, r, ri)
> +C_O1_I2(r, r, rJ)
>  C_O1_I2(r, rZ, r)
>  C_O1_I2(v, v, r)
>  C_O1_I2(v, v, v)
> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index 645f522058..bfd623a639 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -63,6 +63,7 @@ typedef enum TCGReg {
>  #define FACILITY_FAST_BCR_SER FACILITY_LOAD_ON_COND
>  #define FACILITY_DISTINCT_OPS FACILITY_LOAD_ON_COND
>  #define FACILITY_LOAD_ON_COND253
> +#define FACILITY_MISC_INSN_EXT2   58
>  #define FACILITY_VECTOR   129
>  #define FACILITY_VECTOR_ENH1  135
>  
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index d02b433271..cd39b2a208 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -180,6 +180,8 @@ typedef enum S390Opcode {
>  RRE_SLBGR   = 0xb989,
>  RRE_XGR = 0xb982,
>  
> +RRFa_MSRKC  = 0xb9fd,
> +RRFa_MSGRKC = 0xb9ed,
>  RRFa_NRK= 0xb9f4,
>  RRFa_NGRK   = 0xb9e4,
>  RRFa_ORK= 0xb9f6,
> @@ -2140,14 +2142,18 @@ static inline void tcg_out_op(TCGContext *s, 
> TCGOpcode opc,
>  break;
>  
>  case INDEX_op_mul_i32:
> +a0 = args[0], a1 = args[1], a2 = (int32_t)args[2];
>  if (const_args[2]) {
> -if ((int32_t)args[2] == (int16_t)args[2]) {
> -tcg_out_insn(s, RI, MHI, args[0], args[2]);
> +tcg_out_mov(s, TCG_TYPE_I32, a0, a1);

Should we consider a0 == a1 case here as well, in order to get rid of
this extra move when possible?

> +if (a2 == (int16_t)a2) {
> +tcg_out_insn(s, RI, MHI, a0, a2);
>  } else {
> -tcg_out_insn(s, RIL, MSFI, args[0], args[2]);
> +tcg_out_insn(s, RIL, MSFI, a0, a2);
>  }
> +} else if (a0 == a1) {
> +tcg_out_insn(s, RRE, MSR, a0, a2);
>  } else {
> -tcg_out_insn(s, RRE, MSR, args[0], args[2]);
> +tcg_out_insn(s, RRFa, MSRKC, a0, a1, a2);
>  }
>  break;
>  
> @@ -2405,14 +2411,18 @@ static inline void tcg_out_op(TCGContext *s, 
> TCGOpcode opc,
>  break;
>  
>  case INDEX_op_mul_i64:
> +a0 = args[0], a1 = args[1], a2 = args[2];
>  if (const_args[2]) {
> -if (args[2] == (int16_t)args[2]) {
> -tcg_out_insn(s, RI, MGHI, args[0], args[2]);
> +tcg_out_mov(s, TCG_TYPE_I64, a0, a1);

Same here.

> +if (a2 == (int16_t)a2) {
> +tcg_out_insn(s, RI, MGHI, a0, a2);
>  } else {
> -tcg_out_insn(s, RIL, MSGFI, args[0], args[2]);
> +tcg_out_insn(s, RIL, MSGFI, a0, a2);
>  }
> +} else if (a0 == a1) {
> +tcg_out_insn(s, RRE, MSGR, a0, a2);
>  } else {
> -tcg_out_insn(s, RRE, MSGR, args[0], args[2]);
> +tcg_out_insn(s, RRFa, MSGRKC, a0, a1, a2);
>  }
>  break;
>  
> @@ -3072,12 +3082,16 @@ static TCGConstraintSetIndex 
> tcg_target_op_def(TCGOpcode op)
> MULTIPLY SINGLE IMMEDIATE with a signed 32-bit, otherwise we
> have only MULTIPLY HALFWORD IMMEDIATE, with a signed 16-bit.  */
>  return (HAVE_FACILITY(GEN_INST_EXT)
> -? C_O1_I2(r, 0, ri)
> +? (HAVE_FACILITY(MISC_INSN_EXT2)
> +   ? C_O1_I2(r, r, ri)
> +   : C_O1_I2(r, 0, ri))
>  : C_O1_I2(r, 0, rI));
>  
>  case INDEX_op_mul_i64:
>  return (HAVE_FACILITY(GEN_INST_EXT)
> -? C_O1_I2(r, 0, rJ)
> +? (HAVE_FACILITY(MISC_INSN_EXT2)
> +   ? C_O1_I2(r, r, rJ)
> +   : C_O1_I2(r, 0, rJ))
>  : C_O1_I2(r, 0, rI));
>  
>  case INDEX_op_shl_i32:
> -- 
> 2.34.1
> 
>

Re: [PATCH v3 05/13] tcg/s390x: Distinguish RIE formats

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:52PM -0800, Richard Henderson wrote:
> There are multiple variations, with different fields.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 47 +-
>  1 file changed, 26 insertions(+), 21 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 04/13] tcg/s390x: Distinguish RRF-a and RRF-c formats

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:51PM -0800, Richard Henderson wrote:
> One has 3 register arguments; the other has 2 plus an m3 field.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 57 +-
>  1 file changed, 32 insertions(+), 25 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 03/13] tcg/s390x: Use LARL+AGHI for odd addresses

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:50PM -0800, Richard Henderson wrote:
> Add one instead of dropping odd addresses to the constant pool.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.c.inc | 15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 02/13] tcg/s390x: Remove TCG_REG_TB

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote:
> This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and
> several follow-up patches.  The primary motivation is to reduce the
> less-tested code paths, pre-z10.  Secondarily, this allows the
> unconditional use of TCG_TARGET_HAS_direct_jump, which might be more
> important for performance than any slight increase in code size.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target.h |   2 +-
>  tcg/s390x/tcg-target.c.inc | 176 +
>  2 files changed, 23 insertions(+), 155 deletions(-)

Reviewed-by: Ilya Leoshkevich 

I have a few questions/ideas for the future below.
 
> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index 22d70d431b..645f522058 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3];
>  #define TCG_TARGET_HAS_mulsh_i32  0
>  #define TCG_TARGET_HAS_extrl_i64_i32  0
>  #define TCG_TARGET_HAS_extrh_i64_i32  0
> -#define TCG_TARGET_HAS_direct_jumpHAVE_FACILITY(GEN_INST_EXT)
> +#define TCG_TARGET_HAS_direct_jump1

This change doesn't seem to affect that, but what is the minimum
supported s390x qemu host? z900?

>  #define TCG_TARGET_HAS_qemu_st8_i32   0
>  
>  #define TCG_TARGET_HAS_div2_i64   1
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index cb00bb6999..8a4bec0a28 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -65,12 +65,6 @@
>  /* A scratch register that may be be used throughout the backend.  */
>  #define TCG_TMP0TCG_REG_R1
>  
> -/* A scratch register that holds a pointer to the beginning of the TB.
> -   We don't need this when we have pc-relative loads with the general
> -   instructions extension facility.  */
> -#define TCG_REG_TB  TCG_REG_R12
> -#define USE_REG_TB  (!HAVE_FACILITY(GEN_INST_EXT))
> -
>  #ifndef CONFIG_SOFTMMU
>  #define TCG_GUEST_BASE_REG TCG_REG_R13
>  #endif
> @@ -813,8 +807,8 @@ static bool maybe_out_small_movi(TCGContext *s, TCGType 
> type,
>  }
>  
>  /* load a register with an immediate value */
> -static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
> - tcg_target_long sval, bool in_prologue)
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> + TCGReg ret, tcg_target_long sval)
>  {
>  tcg_target_ulong uval;
>  
> @@ -853,14 +847,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType 
> type, TCGReg ret,
>  tcg_out_insn(s, RIL, LARL, ret, off);
>  return;
>  }
> -} else if (USE_REG_TB && !in_prologue) {
> -ptrdiff_t off = tcg_tbrel_diff(s, (void *)sval);
> -if (off == sextract64(off, 0, 20)) {
> -/* This is certain to be an address within TB, and therefore
> -   OFF will be negative; don't try RX_LA.  */
> -tcg_out_insn(s, RXY, LAY, ret, TCG_REG_TB, TCG_REG_NONE, off);
> -return;
> -}
>  }
>  
>  /* A 32-bit unsigned value can be loaded in 2 insns.  And given
> @@ -876,10 +862,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType 
> type, TCGReg ret,
>  if (HAVE_FACILITY(GEN_INST_EXT)) {
>  tcg_out_insn(s, RIL, LGRL, ret, 0);
>  new_pool_label(s, sval, R_390_PC32DBL, s->code_ptr - 2, 2);
> -} else if (USE_REG_TB && !in_prologue) {
> -tcg_out_insn(s, RXY, LG, ret, TCG_REG_TB, TCG_REG_NONE, 0);
> -new_pool_label(s, sval, R_390_20, s->code_ptr - 2,
> -   tcg_tbrel_diff(s, NULL));
>  } else {
>  TCGReg base = ret ? ret : TCG_TMP0;
>  tcg_out_insn(s, RIL, LARL, base, 0);
> @@ -888,12 +870,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType 
> type, TCGReg ret,
>  }
>  }

I did some benchmarking of various ways to load constants in context of
GCC in the past, and it turned out that LLIHF+OILF is more efficient
than literal pool [1].

> -static void tcg_out_movi(TCGContext *s, TCGType type,
> - TCGReg ret, tcg_target_long sval)
> -{
> -tcg_out_movi_int(s, type, ret, sval, false);
> -}
> -
>  /* Emit a load/store type instruction.  Inputs are:
> DATA: The register to be loaded or stored.
> BASE+OFS: The effective address.
> @@ -1020,35 +996,6 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType 
> type, TCGArg val,
>  return false;
>  }
>  
> -/* load data from an absolute host address */
> -static void tcg_out_ld_abs(TCGContext *s, TCGType type,
> -   TCGReg dest, co

Re: [PATCH v3 29/34] tcg: Reorg function calls

2022-12-06 Thread Ilya Leoshkevich

On Tue, 2022-12-06 at 09:49 -0600, Richard Henderson wrote:
> On 12/6/22 09:28, Ilya Leoshkevich wrote:
> > > +    switch (TCG_TARGET_CALL_ARG_I64) {
> > > +    case TCG_CALL_ARG_EVEN:
> > 
> > On a s390x host with gcc-11.0.1-0.3.1.ibm.fc34.s390x I get:
> > 
> > FAILED: libqemu-aarch64-softmmu.fa.p/tcg_tcg.c.o
> > ../tcg/tcg.c: In function ‘init_call_layout’:
> > ../tcg/tcg.c:739:13: error: case value ‘1’ not in enumerated type
> > [-Werror=switch]
> >    739 | case TCG_CALL_ARG_EVEN:
> >    | ^~~~
> > 
> > The following helps:
> 
> Yes, I found and fixed this since.
> 
> > --- a/tcg/tcg.c
> > +++ b/tcg/tcg.c
> > @@ -735,7 +735,7 @@ static void init_call_layout(TCGHelperInfo
> > *info)
> >   break;
> >   
> >   case TCG_TYPE_I64:
> > -    switch (TCG_TARGET_CALL_ARG_I64) {
> > +    switch ((TCGCallArgumentKind)TCG_TARGET_CALL_ARG_I64)
> > {
> >   case TCG_CALL_ARG_EVEN:
> >   layout_arg_even();
> >   /* fall through */
> > 
> > This looks like a gcc bug to me.
> 
> The gcc "bug" is only in not being sufficiently verbose.  It should
> say something about 
> *differing* enumerated types, and perhaps name them.
> 
> Back in patch 20, tcg/s390x/tcg-target.h,
> 
> -#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_RET_NORMAL
> +#define TCG_TARGET_CALL_ARG_I64 TCG_CALL_ARG_NORMAL
> 
> 
> r~

I looked at this line and completely missed the RET vs ARG difference.
Your diff fixes the issue for me too, of course.
Thanks!

Re: [PATCH v3 01/13] tcg/s390x: Use register pair allocation for div and mulu2

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 10:51:48PM -0800, Richard Henderson wrote:
> Previously we hard-coded R2 and R3.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  4 ++--
>  tcg/s390x/tcg-target-con-str.h |  8 +--
>  tcg/s390x/tcg-target.c.inc | 43 +-
>  3 files changed, 35 insertions(+), 20 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v3 29/34] tcg: Reorg function calls

2022-12-06 Thread Ilya Leoshkevich

On Thu, Dec 01, 2022 at 09:39:53PM -0800, Richard Henderson wrote:
> Pre-compute the function call layout for each helper at startup.
> Drop TCG_CALL_DUMMY_ARG, as we no longer need to leave gaps
> in the op->args[] array.  This allows several places to stop
> checking for NULL TCGTemp, to which TCG_CALL_DUMMY_ARG mapped.
> 
> For tcg_gen_callN, loop over the arguments once.  Allocate the TCGOp
> for the call early but delay emitting it, collecting arguments first.
> This allows the argument processing loop to emit code for extensions
> and have them sequenced before the call.
> 
> For tcg_reg_alloc_call, loop over the arguments in reverse order,
> which allows stack slots to be filled first naturally.
> 
> Signed-off-by: Richard Henderson 
> ---
>  include/exec/helper-head.h |   2 +
>  include/tcg/tcg.h  |   5 +-
>  tcg/tcg-internal.h |  22 +-
>  tcg/optimize.c |   6 +-
>  tcg/tcg.c  | 609 ++---
>  5 files changed, 394 insertions(+), 250 deletions(-)

...

> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index d08323db49..74f7491d73 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -547,7 +547,7 @@ void tcg_pool_reset(TCGContext *s)
>  
>  #include "exec/helper-proto.h"
>  
> -static const TCGHelperInfo all_helpers[] = {
> +static TCGHelperInfo all_helpers[] = {
>  #include "exec/helper-tcg.h"
>  };
>  static GHashTable *helper_table;
> @@ -565,6 +565,154 @@ static ffi_type * const typecode_to_ffi[8] = {
>  };
>  #endif
>  
> +typedef struct TCGCumulativeArgs {
> +int arg_idx;/* tcg_gen_callN args[] */
> +int info_in_idx;/* TCGHelperInfo in[] */
> +int arg_slot;   /* regs+stack slot */
> +int ref_slot;   /* stack slots for references */
> +} TCGCumulativeArgs;
> +
> +static void layout_arg_even(TCGCumulativeArgs *cum)
> +{
> +cum->arg_slot += cum->arg_slot & 1;
> +}
> +
> +static void layout_arg_1(TCGCumulativeArgs *cum, TCGHelperInfo *info,
> + TCGCallArgumentKind kind)
> +{
> +TCGCallArgumentLoc *loc = >in[cum->info_in_idx];
> +
> +*loc = (TCGCallArgumentLoc){
> +.kind = kind,
> +.arg_idx = cum->arg_idx,
> +.arg_slot = cum->arg_slot,
> +};
> +cum->info_in_idx++;
> +cum->arg_slot++;
> +}
> +
> +static void layout_arg_normal_n(TCGCumulativeArgs *cum,
> +TCGHelperInfo *info, int n)
> +{
> +TCGCallArgumentLoc *loc = >in[cum->info_in_idx];
> +
> +for (int i = 0; i < n; ++i) {
> +/* Layout all using the same arg_idx, adjusting the subindex. */
> +loc[i] = (TCGCallArgumentLoc){
> +.kind = TCG_CALL_ARG_NORMAL,
> +.arg_idx = cum->arg_idx,
> +.tmp_subindex = i,
> +.arg_slot = cum->arg_slot + i,
> +};
> +}
> +cum->info_in_idx += n;
> +cum->arg_slot += n;
> +}
> +
> +static void init_call_layout(TCGHelperInfo *info)
> +{
> +int max_reg_slots = ARRAY_SIZE(tcg_target_call_iarg_regs);
> +int max_stk_slots = TCG_STATIC_CALL_ARGS_SIZE / sizeof(tcg_target_long);
> +unsigned typemask = info->typemask;
> +unsigned typecode;
> +TCGCumulativeArgs cum = { };
> +
> +/*
> + * Parse and place any function return value.
> + */
> +typecode = typemask & 7;
> +switch (typecode) {
> +case dh_typecode_void:
> +info->nr_out = 0;
> +break;
> +case dh_typecode_i32:
> +case dh_typecode_s32:
> +case dh_typecode_ptr:
> +info->nr_out = 1;
> +info->out_kind = TCG_CALL_RET_NORMAL;
> +break;
> +case dh_typecode_i64:
> +case dh_typecode_s64:
> +info->nr_out = 64 / TCG_TARGET_REG_BITS;
> +info->out_kind = TCG_CALL_RET_NORMAL;
> +break;
> +default:
> +g_assert_not_reached();
> +}
> +assert(info->nr_out <= ARRAY_SIZE(tcg_target_call_oarg_regs));
> +
> +/*
> + * Parse and place function arguments.
> + */
> +for (typemask >>= 3; typemask; typemask >>= 3, cum.arg_idx++) {
> +TCGCallArgumentKind kind;
> +TCGType type;
> +
> +typecode = typemask & 7;
> +switch (typecode) {
> +case dh_typecode_i32:
> +case dh_typecode_s32:
> +type = TCG_TYPE_I32;
> +break;
> +case dh_typecode_i64:
> +case dh_typecode_s64:
> +type = TCG_TYPE_I64;
> +break;
> +case dh_typecode_ptr:
> +type = TCG_TYPE_PTR;
> +break;
> +default:
> +g_assert_not_reached();
> +}
> +
> +switch (type) {
> +case TCG_TYPE_I32:
> +switch (TCG_TARGET_CALL_ARG_I32) {
> +case TCG_CALL_ARG_EVEN:
> +layout_arg_even();
> +/* fall through */
> +case TCG_CALL_ARG_NORMAL:
> +layout_arg_1(, info, TCG_CALL_ARG_NORMAL);
> +break;
> +

Re: [PATCH for-8.0] target/s390x: The MVCP and MVCS instructions are not privileged

2022-12-06 Thread Ilya Leoshkevich

On Mon, 2022-12-05 at 13:58 +0100, Thomas Huth wrote:
> The "MOVE TO PRIMARY/SECONDARY" instructions can also be called
> from problem state. We just should properly check whether the
> secondary-space access key is valid here, too, and inject a
> privileged program exception if it is invalid.
> 
> Signed-off-by: Thomas Huth 
> ---
>  Found only by code inspection - I'm not aware yet of any problem
>  in the wild due to this bug.
> 
>  target/s390x/helper.h    |  4 ++--
>  target/s390x/tcg/insn-data.h.inc |  4 ++--
>  target/s390x/tcg/mem_helper.c    | 16 
>  target/s390x/tcg/translate.c |  6 --
>  4 files changed, 20 insertions(+), 10 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH v2 for-8.0] target/s390x/tcg: Fix and improve the SACF instruction

2022-12-02 Thread Ilya Leoshkevich

On Thu, 2022-12-01 at 19:44 +0100, Thomas Huth wrote:
> The SET ADDRESS SPACE CONTROL FAST instruction is not privileged, it
> can be
> used from problem space, too. Just the switching to the home address
> space
> is privileged and should still generate a privilege exception. This
> bug is
> e.g. causing programs like Java that use the "getcpu" vdso kernel
> function
> to crash (see
> https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=990417#26 ).
> 
> While we're at it, also check if DAT is not enabled. In that case the
> instruction is supposed to generate a special operation exception.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/655
> Signed-off-by: Thomas Huth 
> ---
>  target/s390x/tcg/insn-data.h.inc | 2 +-
>  target/s390x/tcg/cc_helper.c | 7 +++
>  2 files changed, 8 insertions(+), 1 deletion(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH-for-8.0 1/2] target/s390x: Replace TCGv by TCGv_i64 in op_mov2e()

2022-12-01 Thread Ilya Leoshkevich

On Wed, 2022-11-30 at 17:34 +0100, Philippe Mathieu-Daudé wrote:
> Although TCGv is defined as TCGv_i64 on s390x,
> make it clear tcg_temp_new_i64() returns a TCGv_i64.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  target/s390x/tcg/translate.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/target/s390x/tcg/translate.c
> b/target/s390x/tcg/translate.c
> index 1e599ac259..a77039b863 100644
> --- a/target/s390x/tcg/translate.c
> +++ b/target/s390x/tcg/translate.c
> @@ -3335,7 +3335,7 @@ static DisasJumpType op_mov2(DisasContext *s,
> DisasOps *o)
>  static DisasJumpType op_mov2e(DisasContext *s, DisasOps *o)
>  {
>  int b2 = get_field(s, b2);
> -    TCGv ar1 = tcg_temp_new_i64();
> +    TCGv_i64 ar1 = tcg_temp_new_i64();
>  
>      o->out = o->in2;
>  o->g_out = o->g_in2;

Reviewed-by: Ilya Leoshkevich 

It looks as if besides sparc and s390x there is one occurrence of this
in alpha?

$ git grep -w TCGv | grep -w tcg_temp_new_i64
target/alpha/translate.c:TCGv tmp = tcg_temp_new_i64();

Re: [PATCH 25/26] tcg: Introduce tcg_temp_is_normal_*

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:20PM -0700, Richard Henderson wrote:
> Allow targets to determine if a given temp will die across a branch.
> 
> Reviewed-by: Philippe Mathieu-Daudé 
> Signed-off-by: Richard Henderson 
> ---
>  include/tcg/tcg-op.h |  2 ++
>  include/tcg/tcg.h| 15 +++
>  2 files changed, 17 insertions(+)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 24/26] tcg: Introduce tcg_temp_ebb_new_*

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:19PM -0700, Richard Henderson wrote:
> Allow targets to allocate extended-basic-block temps.
> 
> Signed-off-by: Richard Henderson 
> ---
>  include/tcg/tcg-op.h |  2 ++
>  include/tcg/tcg.h| 20 +++-
>  tcg/tcg.c| 16 
>  3 files changed, 25 insertions(+), 13 deletions(-)
> 
> diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
> index 209e168305..0ebbee6e24 100644
> --- a/include/tcg/tcg-op.h
> +++ b/include/tcg/tcg-op.h
> @@ -848,6 +848,7 @@ static inline void tcg_gen_plugin_cb_end(void)
>  #define tcg_temp_new() tcg_temp_new_i32()
>  #define tcg_global_mem_new tcg_global_mem_new_i32
>  #define tcg_temp_local_new() tcg_temp_local_new_i32()
> +#define tcg_temp_ebb_new() tcg_temp_ebb_new_i32()
>  #define tcg_temp_free tcg_temp_free_i32
>  #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32
>  #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32
> @@ -855,6 +856,7 @@ static inline void tcg_gen_plugin_cb_end(void)
>  #define tcg_temp_new() tcg_temp_new_i64()
>  #define tcg_global_mem_new tcg_global_mem_new_i64
>  #define tcg_temp_local_new() tcg_temp_local_new_i64()
> +#define tcg_temp_ebb_new() tcg_temp_ebb_new_i64()
>  #define tcg_temp_free tcg_temp_free_i64
>  #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64
>  #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64
> diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
> index e01a47ec20..3835711d52 100644
> --- a/include/tcg/tcg.h
> +++ b/include/tcg/tcg.h
> @@ -609,7 +609,7 @@ struct TCGContext {
>  #endif
>  
>  GHashTable *const_table[TCG_TYPE_COUNT];
> -TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
> +TCGTempSet free_temps[TCG_TYPE_COUNT * 3];
>  TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
>  
>  QTAILQ_HEAD(, TCGOp) ops, free_ops;
> @@ -890,6 +890,12 @@ static inline TCGv_i32 tcg_temp_local_new_i32(void)
>  return temp_tcgv_i32(t);
>  }
>  
> +static inline TCGv_i32 tcg_temp_ebb_new_i32(void)
> +{
> +TCGTemp *t = tcg_temp_new_internal(TCG_TYPE_I32, TEMP_EBB);
> +return temp_tcgv_i32(t);
> +}
> +
>  static inline TCGv_i64 tcg_global_mem_new_i64(TCGv_ptr reg, intptr_t offset,
>const char *name)
>  {
> @@ -909,6 +915,12 @@ static inline TCGv_i64 tcg_temp_local_new_i64(void)
>  return temp_tcgv_i64(t);
>  }
>  
> +static inline TCGv_i64 tcg_temp_ebb_new_i64(void)
> +{
> +TCGTemp *t = tcg_temp_new_internal(TCG_TYPE_I64, TEMP_EBB);
> +return temp_tcgv_i64(t);
> +}
> +
>  static inline TCGv_ptr tcg_global_mem_new_ptr(TCGv_ptr reg, intptr_t offset,
>const char *name)
>  {
> @@ -928,6 +940,12 @@ static inline TCGv_ptr tcg_temp_local_new_ptr(void)
>  return temp_tcgv_ptr(t);
>  }
>  
> +static inline TCGv_ptr tcg_temp_ebb_new_ptr(void)
> +{
> +TCGTemp *t = tcg_temp_new_internal(TCG_TYPE_PTR, TEMP_EBB);
> +return temp_tcgv_ptr(t);
> +}
> +
>  #if defined(CONFIG_DEBUG_TCG)
>  /* If you call tcg_clear_temp_count() at the start of a section of
>   * code which is not supposed to leak any TCG temporaries, then
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index acdbd5a9a2..7aa6cc3451 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -948,17 +948,8 @@ TCGTemp *tcg_temp_new_internal(TCGType type, TCGTempKind 
> kind)
>  TCGTemp *ts;
>  int idx, k;
>  
> -switch (kind) {
> -case TEMP_NORMAL:
> -k = 0;
> -break;
> -case TEMP_LOCAL:
> -k = TCG_TYPE_COUNT;
> -break;
> -default:
> -g_assert_not_reached();
> -}
> -k += type;
> +assert(kind >= TEMP_NORMAL && kind <= TEMP_LOCAL);

Nit: maybe also add QEMU_BUILD_BUG_ON(TEMP_NORMAL != 0)
and QEMU_BUILD_BUG_ON(TEMP_LOCAL != 2), since we are using this for
0-based array indexing here? Alternatively, subtract TEMP_NORMAL
from kind.

> +k = TCG_TYPE_COUNT * kind + type;
>  
>  idx = find_first_bit(s->free_temps[k].l, TCG_MAX_TEMPS);
>  if (idx < TCG_MAX_TEMPS) {
> @@ -1046,6 +1037,7 @@ void tcg_temp_free_internal(TCGTemp *ts)
>   */
>  return;
>  case TEMP_NORMAL:
> +case TEMP_EBB:
>  case TEMP_LOCAL:
>  break;
>  default:
> @@ -1063,7 +1055,7 @@ void tcg_temp_free_internal(TCGTemp *ts)
>  ts->temp_allocated = 0;
>  
>  idx = temp_idx(ts);
> -k = ts->base_type + (ts->kind == TEMP_NORMAL ? 0 : TCG_TYPE_COUNT);
> +k = ts->base_type + ts->kind * TCG_TYPE_COUNT;
>  set_bit(idx, s->free_temps[k].l);
>  }
>  
> -- 
> 2.34.1
> 
> 

Reviewed-by: Ilya Leoshkevich 

While not directly related to this patch, it would be good to update
tcg/README with all the new kinds of temporaries. E.g. the EBB ones are
not mentioned there:

TCG instructions operate on variables which are temporaries, local
temporaries or globals.

Re: [PATCH 23/26] tcg: Pass TCGTempKind to tcg_temp_new_internal

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:18PM -0700, Richard Henderson wrote:
> Reviewed-by: Philippe Mathieu-Daudé 
> Signed-off-by: Richard Henderson 
> ---
>  include/tcg/tcg.h | 14 +++---
>  tcg/tcg.c | 20 +++-
>  2 files changed, 22 insertions(+), 12 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 22/26] target/s390x: Pass original r2 register to BCR

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:17PM -0700, Richard Henderson wrote:
> We do not modify any general-purpose registers in BCR,
> which means that we may be able to avoid saving the
> value across a branch.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c   | 10 ++
>  target/s390x/tcg/insn-data.def |  2 +-
>  2 files changed, 11 insertions(+), 1 deletion(-)

Reviewed-by: Ilya Leoshkevich

[PATCH 2/2] tests/tcg/s390x: Add per.S

2022-11-30 Thread Ilya Leoshkevich

Add a small test to avoid regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/per.S   | 55 +
 2 files changed, 56 insertions(+)
 create mode 100644 tests/tcg/s390x/per.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index a34fa68473e..1d649753f4e 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -7,3 +7,4 @@ QEMU_OPTS=-action panic=exit-failure -kernel
-Wl,--build-id=none $< -o $@
 
 TESTS += unaligned-lowcore
+TESTS += per
diff --git a/tests/tcg/s390x/per.S b/tests/tcg/s390x/per.S
new file mode 100644
index 000..02f8422c44d
--- /dev/null
+++ b/tests/tcg/s390x/per.S
@@ -0,0 +1,55 @@
+#define N_ITERATIONS 10
+
+.org 0x8d
+ilc:
+.org 0x8e
+program_interruption_code:
+.org 0x96
+per_code:
+.org 0x150
+program_old_psw:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm_handler
+.org 0x200 /* lowcore padding */
+
+.globl _start
+_start:
+lpswe per_on_psw
+start_per:
+lghi %r0,N_ITERATIONS
+xgr %r1,%r1
+lctlg %c9,%c11,per_on_regs
+loop:
+brct %r0,loop
+lctlg %c9,%c11,per_off_regs
+cgijne %r1,N_ITERATIONS-1,fail /* expected number of events? */
+lpswe success_psw
+fail:
+lpswe failure_psw
+
+pgm_handler:
+chhsi program_interruption_code,0x80 /* PER event? */
+jne fail
+cli per_code,0x80  /* successful-branching event? */
+jne fail
+la %r1,1(%r1)  /* increment event counter */
+mvc return_psw(8),program_old_psw
+lg %r3,program_old_psw+8
+llgc %r2,ilc
+sgr %r3,%r2/* rewind PSW */
+stg %r3,return_psw+8
+lpswe return_psw
+
+.align 8
+per_on_psw:
+.quad 0x4000,start_per
+per_on_regs:
+.quad 0x8000,0,-1  /* successful-branching everywhere */
+per_off_regs:
+.quad 0,0,0
+return_psw:
+.quad 0,0
+success_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2,0/* disabled wait */
-- 
2.38.1

[PATCH 1/2] target/s390x: Fix successful-branch PER events

2022-11-30 Thread Ilya Leoshkevich

The branching code sets per_perc_atmid, but afterwards it does
goto_tb/exit_tb, so per_check_exception() added by translate_one() is
not reached.

Fix by raising PER exception in per_branch().

Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/misc_helper.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/s390x/tcg/misc_helper.c b/target/s390x/tcg/misc_helper.c
index 71388a71197..b7220cef44b 100644
--- a/target/s390x/tcg/misc_helper.c
+++ b/target/s390x/tcg/misc_helper.c
@@ -619,6 +619,7 @@ void HELPER(per_branch)(CPUS390XState *env, uint64_t from, 
uint64_t to)
 || get_per_in_range(env, to)) {
 env->per_address = from;
 env->per_perc_atmid = PER_CODE_EVENT_BRANCH | get_per_atmid(env);
+tcg_s390_program_interrupt(env, PGM_PER, GETPC());
 }
 }
 }
-- 
2.38.1

Re: [PATCH 21/26] target/s390x: Remove PER check from use_goto_tb

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:16PM -0700, Richard Henderson wrote:
> While it is common for the PC update to happen in the
> shadow of a goto_tb, it is not required to be there.
> By moving it before the goto_tb, we can also place the
> call to helper_per_branch there, and then afterward
> chain to the next tb.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 10 --
>  1 file changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
> index a2315ac73e..e6c7c2a6ae 100644
> --- a/target/s390x/tcg/translate.c
> +++ b/target/s390x/tcg/translate.c
> @@ -654,9 +654,6 @@ static void gen_op_calc_cc(DisasContext *s)
>  
>  static bool use_goto_tb(DisasContext *s, uint64_t dest)
>  {
> -if (per_enabled(s)) {
> -return false;
> -}
>  return translator_use_goto_tb(>base, dest);
>  }
>  
> @@ -1157,15 +1154,16 @@ static DisasJumpType help_goto_direct(DisasContext 
> *s, int64_t disp)
>  per_branch_disp(s, disp);
>  return DISAS_NEXT;
>  }
> +
> +update_psw_addr_disp(s, disp);
> +per_branch_dest(s, psw_addr);
> +
>  if (use_goto_tb(s, s->base.pc_next + disp)) {
>  update_cc_op(s);
>  tcg_gen_goto_tb(0);
> -update_psw_addr_disp(s, disp);
>  tcg_gen_exit_tb(s->base.tb, 0);
>  return DISAS_NORETURN;
>  } else {
> -update_psw_addr_disp(s, disp);
> -per_branch_dest(s, psw_addr);
>  return DISAS_PC_UPDATED;
>  }
>  }
> -- 
> 2.34.1

While looking at this, I had a hard time convincing myself that
successful-branch PER events work at all. The code does set
per_perc_atmid, but afterwards it does goto_tb/exit_tb, and does
not reach per_check_exception() added by translate_one().

I wrote a small test for this theory by turning on PER for
successful-branch events and looping 10 times. It passes in KVM, but
fails in TCG. Here is the relevant IR fragment:

IN: 
0x0212:  a706    brct %r0, 0x212

OP:
 ld_i32 tmp0,env,$0xfff0
 brcond_i32 tmp0,$0x0,lt,$L0

  0212 0004 0004
 mov_i64 tmp2,psw_addr
 call per_ifetch,$0x1,$0,env,tmp2
 sub_i64 tmp2,r0,$0x1
 extract2_i64 r0,r0,tmp2,$0x20
 rotl_i64 r0,r0,$0x20
 mov_i32 tmp0,tmp2
 brcond_i32 tmp0,$0x0,eq,$L1
 /* branch taken: set per_perc_atmid and exit */
 mov_i64 gbea,psw_addr
 call per_branch,$0x1,$0,env,gbea,psw_addr
 goto_tb $0x0
 exit_tb $0x7f73fc000400
 set_label $L1
 add_i64 psw_addr,psw_addr,$0x4
 goto_tb $0x1
 exit_tb $0x7f73fc000401
 /* check per_perc_atmid */
 call per_check_exception,$0x0,$0,env
 set_label $L0
 exit_tb $0x7f73fc000403

I will post the proposed fix and the test itself shortly.

That said, the patch makes sense to me and does not make things worse,
so:

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 20/26] target/s390x: Split per_breaking_event from per_branch_*

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:15PM -0700, Richard Henderson wrote:
> This allows us to update gbea before other updates to psw_addr,
> which will be important for TARGET_TB_PCREL.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 19/26] target/s390x: Simplify help_branch

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:14PM -0700, Richard Henderson wrote:
> Always use a tcg branch, instead of movcond.  The movcond
> was not a bad idea before PER was added, but since then
> we have either 2 or 3 actions to perform on each leg of
> the branch, and multiple movcond is inefficient.
> 
> Reorder the taken branch to be fallthrough of the tcg branch.
> This will be helpful later with TARGET_TB_PCREL.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 154 ++-
>  1 file changed, 44 insertions(+), 110 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 17/26] target/s390x: Introduce help_goto_indirect

2022-11-30 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:12PM -0700, Richard Henderson wrote:
> Add a small helper to handle unconditional indirect jumps.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 19 ++-
>  1 file changed, 10 insertions(+), 9 deletions(-)

Reviewed-by: Ilya Leoshkevich

[PATCH v3] tests/tcg/s390x: Add cdsg.c

2022-11-29 Thread Ilya Leoshkevich

Add a simple test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---

Sorry, I just realized that in v2 that I sent the iteration count was
not increased. For v3 I've decided to bump it further to 1m, since it's
still fast enough:

$ time -p ./qemu-s390x ./tests/tcg/s390x-linux-user/cdsg
real 0.15

v2 -> v3: Increase iteration count to 1m.
v1 -> v2: Add cdsg() wrapper.

 tests/tcg/s390x/Makefile.target |  4 ++
 tests/tcg/s390x/cdsg.c  | 85 +
 2 files changed, 89 insertions(+)
 create mode 100644 tests/tcg/s390x/cdsg.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 1d454270c0e..523214dac33 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -27,6 +27,7 @@ TESTS+=noexec
 TESTS+=div
 TESTS+=clst
 TESTS+=long-double
+TESTS+=cdsg
 
 Z13_TESTS=vistr
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
@@ -66,3 +67,6 @@ sha512-mvx: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
 
 TESTS+=sha512-mvx
+
+cdsg: CFLAGS+=-pthread
+cdsg: LDFLAGS+=-pthread
diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
new file mode 100644
index 000..c7a5246181d
--- /dev/null
+++ b/tests/tcg/s390x/cdsg.c
@@ -0,0 +1,85 @@
+#include 
+#include 
+#include 
+#include 
+
+static volatile bool start;
+typedef unsigned long aligned_quadword[2] __attribute__((__aligned__(16)));
+static aligned_quadword val;
+static const int n_iterations = 100;
+
+static inline int cdsg(unsigned long *orig0, unsigned long *orig1,
+   unsigned long new0, unsigned long new1,
+   aligned_quadword *mem)
+{
+register unsigned long r0 asm("r0");
+register unsigned long r1 asm("r1");
+register unsigned long r2 asm("r2");
+register unsigned long r3 asm("r3");
+int cc;
+
+r0 = *orig0;
+r1 = *orig1;
+r2 = new0;
+r3 = new1;
+asm("cdsg %[r0],%[r2],%[db2]\n"
+"ipm %[cc]"
+: [r0] "+r" (r0)
+, [r1] "+r" (r1)
+, [db2] "+m" (*mem)
+, [cc] "=r" (cc)
+: [r2] "r" (r2)
+, [r3] "r" (r3)
+: "cc");
+*orig0 = r0;
+*orig1 = r1;
+
+return (cc >> 28) & 3;
+}
+
+void *cdsg_loop(void *arg)
+{
+unsigned long orig0, orig1, new0, new1;
+int cc;
+int i;
+
+while (!start) {
+}
+
+orig0 = val[0];
+orig1 = val[1];
+for (i = 0; i < n_iterations;) {
+new0 = orig0 + 1;
+new1 = orig1 + 2;
+
+cc = cdsg(, , new0, new1, );
+
+if (cc == 0) {
+orig0 = new0;
+orig1 = new1;
+i++;
+} else {
+assert(cc == 1);
+}
+}
+
+return NULL;
+}
+
+int main(void)
+{
+pthread_t thread;
+int ret;
+
+ret = pthread_create(, NULL, cdsg_loop, NULL);
+assert(ret == 0);
+start = true;
+cdsg_loop(NULL);
+ret = pthread_join(thread, NULL);
+assert(ret == 0);
+
+assert(val[0] == n_iterations * 2);
+assert(val[1] == n_iterations * 4);
+
+return EXIT_SUCCESS;
+}
-- 
2.38.1

[PATCH v2] tests/tcg/s390x: Add cdsg.c

2022-11-29 Thread Ilya Leoshkevich

Add a simple test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.target |  4 ++
 tests/tcg/s390x/cdsg.c  | 84 +
 2 files changed, 88 insertions(+)
 create mode 100644 tests/tcg/s390x/cdsg.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 1d454270c0e..523214dac33 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -27,6 +27,7 @@ TESTS+=noexec
 TESTS+=div
 TESTS+=clst
 TESTS+=long-double
+TESTS+=cdsg
 
 Z13_TESTS=vistr
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
@@ -66,3 +67,6 @@ sha512-mvx: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
 
 TESTS+=sha512-mvx
+
+cdsg: CFLAGS+=-pthread
+cdsg: LDFLAGS+=-pthread
diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
new file mode 100644
index 000..28b5ac9a000
--- /dev/null
+++ b/tests/tcg/s390x/cdsg.c
@@ -0,0 +1,84 @@
+#include 
+#include 
+#include 
+#include 
+
+static volatile bool start;
+typedef unsigned long aligned_quadword[2] __attribute__((__aligned__(16)));
+static aligned_quadword val;
+
+static inline int cdsg(unsigned long *orig0, unsigned long *orig1,
+   unsigned long new0, unsigned long new1,
+   aligned_quadword *mem)
+{
+register unsigned long r0 asm("r0");
+register unsigned long r1 asm("r1");
+register unsigned long r2 asm("r2");
+register unsigned long r3 asm("r3");
+int cc;
+
+r0 = *orig0;
+r1 = *orig1;
+r2 = new0;
+r3 = new1;
+asm("cdsg %[r0],%[r2],%[db2]\n"
+"ipm %[cc]"
+: [r0] "+r" (r0)
+, [r1] "+r" (r1)
+, [db2] "+m" (*mem)
+, [cc] "=r" (cc)
+: [r2] "r" (r2)
+, [r3] "r" (r3)
+: "cc");
+*orig0 = r0;
+*orig1 = r1;
+
+return (cc >> 28) & 3;
+}
+
+void *cdsg_loop(void *arg)
+{
+unsigned long orig0, orig1, new0, new1;
+int cc;
+int i;
+
+while (!start) {
+}
+
+orig0 = val[0];
+orig1 = val[1];
+for (i = 0; i < 1000;) {
+new0 = orig0 + 1;
+new1 = orig1 + 2;
+
+cc = cdsg(, , new0, new1, );
+
+if (cc == 0) {
+orig0 = new0;
+orig1 = new1;
+i++;
+} else {
+assert(cc == 1);
+}
+}
+
+return NULL;
+}
+
+int main(void)
+{
+pthread_t thread;
+int ret;
+
+ret = pthread_create(, NULL, cdsg_loop, NULL);
+assert(ret == 0);
+start = true;
+cdsg_loop(NULL);
+ret = pthread_join(thread, NULL);
+assert(ret == 0);
+
+assert(val[0] == 2000);
+assert(val[1] == 4000);
+
+return EXIT_SUCCESS;
+}
-- 
2.38.1

Re: [PATCH] tests/tcg/s390x: Add cdsg.c

2022-11-29 Thread Ilya Leoshkevich

On Tue, Nov 29, 2022 at 09:54:13AM +0100, David Hildenbrand wrote:
> On 29.11.22 00:48, Ilya Leoshkevich wrote:
> > Add a simple test to prevent regressions.
> > 
> > Signed-off-by: Ilya Leoshkevich 
> > ---
> >   tests/tcg/s390x/Makefile.target |  4 ++
> >   tests/tcg/s390x/cdsg.c  | 73 +
> >   2 files changed, 77 insertions(+)
> >   create mode 100644 tests/tcg/s390x/cdsg.c
> > 
> > diff --git a/tests/tcg/s390x/Makefile.target 
> > b/tests/tcg/s390x/Makefile.target
> > index 1d454270c0e..523214dac33 100644
> > --- a/tests/tcg/s390x/Makefile.target
> > +++ b/tests/tcg/s390x/Makefile.target
> > @@ -27,6 +27,7 @@ TESTS+=noexec
> >   TESTS+=div
> >   TESTS+=clst
> >   TESTS+=long-double
> > +TESTS+=cdsg
> >   Z13_TESTS=vistr
> >   $(Z13_TESTS): CFLAGS+=-march=z13 -O2
> > @@ -66,3 +67,6 @@ sha512-mvx: sha512.c
> > $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
> >   TESTS+=sha512-mvx
> > +
> > +cdsg: CFLAGS+=-pthread
> > +cdsg: LDFLAGS+=-pthread
> > diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
> > new file mode 100644
> > index 000..83313699f7d
> > --- /dev/null
> > +++ b/tests/tcg/s390x/cdsg.c
> > @@ -0,0 +1,73 @@
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +static volatile bool start;
> > +static unsigned long val[2] __attribute__((__aligned__(16)));
> > +
> > +void *cdsg_loop(void *arg)
> > +{
> > +unsigned long orig0, orig1, new0, new1;
> > +register unsigned long r0 asm("r0");
> > +register unsigned long r1 asm("r1");
> > +register unsigned long r2 asm("r2");
> > +register unsigned long r3 asm("r3");
> > +int cc;
> > +int i;
> > +
> > +while (!start) {
> > +}
> > +
> > +orig0 = val[0];
> > +orig1 = val[1];
> > +for (i = 0; i < 1000;) {
> 
> Are 1000 iterations sufficient to catch the race window reliably?

Good point, I had to raise it to 10k.
If I break the code like this:

--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3509,7 +3509,7 @@ void tcg_gen_atomic_cmpxchg_i128(TCGv_i128 retv, TCGv 
addr, TCGv_i128 cmpv,
 {
 gen_atomic_cx_i128 gen;
 
-if (!(tcg_ctx->tb_cflags & CF_PARALLEL)) {
+if (true) {
 tcg_gen_nonatomic_cmpxchg_i128(retv, addr, cmpv, newv, idx, memop);
 return;
 }
 
the test with 10k iterations fails consistently.
And it's still fast:

$ time -p ./qemu-s390x ./tests/tcg/s390x-linux-user/cdsg
real 0.01

> > +new0 = orig0 + 1;
> > +new1 = orig1 + 2;
> > +
> > +r0 = orig0;
> > +r1 = orig1;
> > +r2 = new0;
> > +r3 = new1;
> > +asm("cdsg %[r0],%[r2],%[db2]\n"
> > +"ipm %[cc]"
> > +: [r0] "+r" (r0)
> > +, [r1] "+r" (r1)
> > +, [db2] "=m" (val)
> > +, [cc] "=r" (cc)
> > +: [r2] "r" (r2)
> > +, [r3] "r" (r3)
> > +: "cc");
> 
> Nit: I'd suggest a simple cdsg helper function that makes this code easier
> to digest.

Ok.

> 
> > +orig0 = r0;
> > +orig1 = r1;
> > +cc = (cc >> 28) & 3;
> > +
> > +if (cc == 0) {
> > +orig0 = new0;
> > +orig1 = new1;
> > +i++;
> > +} else {
> > +assert(cc == 1);
> > +}
> > +}
> > +
> > +return NULL;
> > +}
> > +
> > +int main(void)
> > +{
> > +pthread_t thread;
> > +int ret;
> > +
> > +ret = pthread_create(, NULL, cdsg_loop, NULL);
> > +assert(ret == 0);
> > +start = true;
> > +cdsg_loop(NULL);
> > +ret = pthread_join(thread, NULL);
> > +assert(ret == 0);
> > +
> > +assert(val[0] == 2000);
> > +assert(val[1] == 4000);
> > +
> > +return EXIT_SUCCESS;
> > +}
> 
> -- 
> Thanks,
> 
> David / dhildenb
> 
>

[PATCH] tests/tcg/s390x: Add sam.S

2022-11-28 Thread Ilya Leoshkevich

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/sam.S   | 67 +
 2 files changed, 68 insertions(+)
 create mode 100644 tests/tcg/s390x/sam.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index a34fa68473e..738b04530fc 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -7,3 +7,4 @@ QEMU_OPTS=-action panic=exit-failure -kernel
-Wl,--build-id=none $< -o $@
 
 TESTS += unaligned-lowcore
+TESTS += sam
diff --git a/tests/tcg/s390x/sam.S b/tests/tcg/s390x/sam.S
new file mode 100644
index 000..4cab2dd2007
--- /dev/null
+++ b/tests/tcg/s390x/sam.S
@@ -0,0 +1,67 @@
+/* DAT on, home-space mode, 64-bit mode */
+#define DAT_PSWM 0x400c0018000
+#define VIRTUAL_BASE 0x123456789abcd000
+
+.org 0x8e
+program_interruption_code:
+.org 0x150
+program_old_psw:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm_handler
+.org 0x200 /* lowcore padding */
+
+.globl _start
+_start:
+lctlg %c13,%c13,hasce
+lpswe dat_psw
+start_dat:
+sam24
+sam24_suppressed:
+/* sam24 should fail */
+fail:
+basr %r12,%r0
+lpswe failure_psw-.(%r12)
+pgm_handler:
+chhsi program_interruption_code,6  /* specification exception? */
+jne fail
+clc suppressed_psw(16),program_old_psw  /* correct location? */
+jne fail
+lpswe success_psw
+
+.align 8
+dat_psw:
+.quad DAT_PSWM,VIRTUAL_BASE+start_dat
+suppressed_psw:
+.quad DAT_PSWM,VIRTUAL_BASE+sam24_suppressed
+success_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2,0/* disabled wait */
+hasce:
+/* DT = 0b11 (region-first-table), TL = 3 (2k entries) */
+.quad region_first_table + (3 << 2) + 3
+.align 0x1000
+region_first_table:
+.org region_first_table + ((VIRTUAL_BASE >> 53) & 0x7ff) * 8
+/* TT = 0b11 (region-first-table), TL = 3 (2k entries) */
+.quad region_second_table + (3 << 2) + 3
+.org region_first_table + 0x800 * 8
+region_second_table:
+.org region_second_table + ((VIRTUAL_BASE >> 42) & 0x7ff) * 8
+/* TT = 0b10 (region-second-table), TL = 3 (2k entries) */
+.quad region_third_table + (2 << 2) + 3
+.org region_second_table + 0x800 * 8
+region_third_table:
+.org region_third_table + ((VIRTUAL_BASE >> 31) & 0x7ff) * 8
+/* TT = 0b01 (region-third-table), TL = 3 (2k entries) */
+.quad segment_table + (1 << 2) + 3
+.org region_third_table + 0x800 * 8
+segment_table:
+.org segment_table + ((VIRTUAL_BASE >> 20) & 0x7ff) * 8
+/* TT = 0b00 (segment-table) */
+.quad page_table
+.org segment_table + 0x800 * 8
+page_table:
+.org page_table + ((VIRTUAL_BASE >> 12) & 0xff) * 8
+.quad 0
+.org page_table + 0x100 * 8
-- 
2.38.1

Re: [PATCH 12/26] target/s390x: Move masking of psw.addr to cpu_get_tb_cpu_state

2022-11-28 Thread Ilya Leoshkevich

On Sat, Nov 05, 2022 at 09:27:07AM +1100, Richard Henderson wrote:
> On 11/4/22 00:42, Ilya Leoshkevich wrote:
> > On Wed, Oct 05, 2022 at 08:44:07PM -0700, Richard Henderson wrote:
> > > Masking after the fact in s390x_tr_init_disas_context
> > > provides incorrect information to tb_lookup.
> > > 
> > > Signed-off-by: Richard Henderson 
> > > ---
> > >   target/s390x/cpu.h   | 13 +++--
> > >   target/s390x/tcg/translate.c |  6 --
> > >   2 files changed, 7 insertions(+), 12 deletions(-)
> > 
> > How can we end up in a situation where this matters? E.g. if we are in
> > 64-bit mode and execute
> > 
> >  0xa12345678: sam31
> > 
> > we will get a specification exception, and cpu_get_tb_cpu_state() will
> > not run. And for valid 31-bit addresses masking should be a no-op.
> 
> Ah, true.  I was mislead by the presence of the code in
> s390x_tr_init_disas_context. Perhaps a tcg_debug_assert or just a comment?

An assert sounds good to me.
I tried the following diff with the attached test and it worked:

--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -390,7 +390,12 @@ static inline void cpu_get_tb_cpu_state(CPUS390XState* 
env, target_ulong *pc,
 }
 *pflags = flags;
 *cs_base = env->ex_value;
-*pc = (flags & FLAG_MASK_64 ? env->psw.addr : env->psw.addr & 0x7fff);
+if (!(flags & FLAG_MASK_32)) {
+g_assert(env->psw.addr <= 0xff);
+} else if (!(flags & FLAG_MASK_64)) {
+g_assert(env->psw.addr <= 0x7fff);
+}
+*pc = env->psw.addr;
 }
 
 /* PER bits from control register 9 */
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 24dc57a8816..a50453dd0d4 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -6464,6 +6464,12 @@ static void s390x_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
 {
 DisasContext *dc = container_of(dcbase, DisasContext, base);
 
+if (!(dc->base.tb->flags & FLAG_MASK_32)) {
+tcg_debug_assert(dc->base.pc_first <= 0xff);
+} else if (!(dc->base.tb->flags & FLAG_MASK_64)) {
+tcg_debug_assert(dc->base.pc_first <= 0x7fff);
+}
+
 dc->pc_save = dc->base.pc_first;
 dc->cc_op = CC_OP_DYNAMIC;
 dc->ex_value = dc->base.tb->cs_base;

[PATCH] tests/tcg/s390x: Add cdsg.c

2022-11-28 Thread Ilya Leoshkevich

Add a simple test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.target |  4 ++
 tests/tcg/s390x/cdsg.c  | 73 +
 2 files changed, 77 insertions(+)
 create mode 100644 tests/tcg/s390x/cdsg.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 1d454270c0e..523214dac33 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -27,6 +27,7 @@ TESTS+=noexec
 TESTS+=div
 TESTS+=clst
 TESTS+=long-double
+TESTS+=cdsg
 
 Z13_TESTS=vistr
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
@@ -66,3 +67,6 @@ sha512-mvx: sha512.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $< -o $@ $(LDFLAGS)
 
 TESTS+=sha512-mvx
+
+cdsg: CFLAGS+=-pthread
+cdsg: LDFLAGS+=-pthread
diff --git a/tests/tcg/s390x/cdsg.c b/tests/tcg/s390x/cdsg.c
new file mode 100644
index 000..83313699f7d
--- /dev/null
+++ b/tests/tcg/s390x/cdsg.c
@@ -0,0 +1,73 @@
+#include 
+#include 
+#include 
+#include 
+
+static volatile bool start;
+static unsigned long val[2] __attribute__((__aligned__(16)));
+
+void *cdsg_loop(void *arg)
+{
+unsigned long orig0, orig1, new0, new1;
+register unsigned long r0 asm("r0");
+register unsigned long r1 asm("r1");
+register unsigned long r2 asm("r2");
+register unsigned long r3 asm("r3");
+int cc;
+int i;
+
+while (!start) {
+}
+
+orig0 = val[0];
+orig1 = val[1];
+for (i = 0; i < 1000;) {
+new0 = orig0 + 1;
+new1 = orig1 + 2;
+
+r0 = orig0;
+r1 = orig1;
+r2 = new0;
+r3 = new1;
+asm("cdsg %[r0],%[r2],%[db2]\n"
+"ipm %[cc]"
+: [r0] "+r" (r0)
+, [r1] "+r" (r1)
+, [db2] "=m" (val)
+, [cc] "=r" (cc)
+: [r2] "r" (r2)
+, [r3] "r" (r3)
+: "cc");
+orig0 = r0;
+orig1 = r1;
+cc = (cc >> 28) & 3;
+
+if (cc == 0) {
+orig0 = new0;
+orig1 = new1;
+i++;
+} else {
+assert(cc == 1);
+}
+}
+
+return NULL;
+}
+
+int main(void)
+{
+pthread_t thread;
+int ret;
+
+ret = pthread_create(, NULL, cdsg_loop, NULL);
+assert(ret == 0);
+start = true;
+cdsg_loop(NULL);
+ret = pthread_join(thread, NULL);
+assert(ret == 0);
+
+assert(val[0] == 2000);
+assert(val[1] == 4000);
+
+return EXIT_SUCCESS;
+}
-- 
2.38.1

Re: [PATCH for-8.0 v2 13/13] target/s390x: Implement CC_OP_NZ in gen_op_calc_cc

2022-11-28 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 06:08:20PM +1000, Richard Henderson wrote:
> This case is trivial to implement inline.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 3 +++
>  1 file changed, 3 insertions(+)

Acked-by: Ilya Leoshkevich

Re: [PATCH for-8.0 v2 12/13] target/s390x: Use tcg_gen_atomic_cmpxchg_i128 for CDSG

2022-11-28 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 06:08:19PM +1000, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/helper.h|  2 --
>  target/s390x/tcg/mem_helper.c| 52 ---
>  target/s390x/tcg/translate.c | 60 
>  target/s390x/tcg/insn-data.h.inc |  2 +-
>  4 files changed, 38 insertions(+), 78 deletions(-)

Acked-by: Ilya Leoshkevich 

I was wondering what assembly this would generate in parallel mode and
wrote a small test. On my x86_64 machine it ended up being
helper_atomic_cmpxchgo_be() -> cpu_atomic_cmpxchgo_be_mmu() ->
lock cmpxchg16b, nothing surprising.

On an s390x host we fall back to cpu_exec_step_atomic(), because in the
configure test:

  int main(void)
  {
unsigned __int128 x = 0, y = 0;
__sync_val_compare_and_swap_16(, y, x);
return 0;
  }

x and y are not aligned. I guess that's working as intended as well,
even though it would be nice to eventually make use of cdsg there.

I will post the test shortly.

Re: [PATCH for-8.0 v2 11/13] target/s390x: Use Int128 for passing float128

2022-11-28 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 06:08:18PM +1000, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
> v2: Fix SPEC_in1_x1.
> ---
>  target/s390x/helper.h| 32 ++--
>  target/s390x/tcg/fpu_helper.c| 88 ++--
>  target/s390x/tcg/translate.c | 76 ++-
>  target/s390x/tcg/insn-data.h.inc | 30 +--
>  4 files changed, 121 insertions(+), 105 deletions(-)

Acked-by: Ilya Leoshkevich

Re: [PATCH for-8.0 v2 10/13] target/s390x: Use Int128 for returning float128

2022-11-28 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 06:08:17PM +1000, Richard Henderson wrote:
> Reviewed-by: Philippe Mathieu-Daudé 
> Signed-off-by: Richard Henderson 
> ---
> v2: Remove extraneous return_low128.
> ---
>  target/s390x/helper.h| 22 +++---
>  target/s390x/tcg/fpu_helper.c| 29 +-
>  target/s390x/tcg/translate.c | 51 +---
>  target/s390x/tcg/insn-data.h.inc | 20 ++---
>  4 files changed, 63 insertions(+), 59 deletions(-)

Acked-by: Ilya Leoshkevich

Re: [PATCH for-8.0 v2 04/13] target/s390x: Use a single return for helper_divs32/u32

2022-11-28 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 06:08:11PM +1000, Richard Henderson wrote:
> Pack the quotient and remainder into a single uint64_t.
> 
> Signed-off-by: Richard Henderson 
> ---
> v2: Fix operand ordering; use tcg_extr32_i64.
> ---
>  target/s390x/helper.h |  2 +-
>  target/s390x/tcg/int_helper.c | 26 +-
>  target/s390x/tcg/translate.c  |  8 
>  3 files changed, 18 insertions(+), 18 deletions(-)

Acked-by: Ilya Leoshkevich

[PATCH v2 1/1] tcg: add perfmap and jitdump

2022-11-14 Thread Ilya Leoshkevich

Add ability to dump /tmp/perf-.map and jit-.dump.
The first one allows the perf tool to map samples to each individual
translation block. The second one adds the ability to resolve symbol
names, line numbers and inspect JITed code.

Example of use:

perf record qemu-x86_64 -perfmap ./a.out
perf report

or

perf record -k 1 qemu-x86_64 -jitdump ./a.out
perf inject -j -i perf.data -o perf.data.jitted
perf report -i perf.data.jitted

Co-developed-by: Vanderson M. do Rosario 
Co-developed-by: Alex Bennée 
Signed-off-by: Ilya Leoshkevich 
Message-Id: <20221012051846.1432050-2-...@linux.ibm.com>
---
 accel/tcg/debuginfo.c |  99 +++
 accel/tcg/debuginfo.h |  52 ++
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 334 ++
 accel/tcg/perf.h  |  28 
 accel/tcg/translate-all.c |   8 +
 docs/devel/tcg.rst|  23 +++
 hw/core/loader.c  |   5 +
 include/tcg/tcg.h |   4 +-
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/region.c  |   2 +-
 tcg/tcg.c |   2 +
 18 files changed, 616 insertions(+), 3 deletions(-)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

diff --git a/accel/tcg/debuginfo.c b/accel/tcg/debuginfo.c
new file mode 100644
index 000..c312db77146
--- /dev/null
+++ b/accel/tcg/debuginfo.c
@@ -0,0 +1,99 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/lockable.h"
+
+#include 
+
+#include "debuginfo.h"
+
+static QemuMutex lock;
+static Dwfl *dwfl;
+static const Dwfl_Callbacks dwfl_callbacks = {
+.find_elf = NULL,
+.find_debuginfo = dwfl_standard_find_debuginfo,
+.section_address = NULL,
+.debuginfo_path = NULL,
+};
+
+__attribute__((constructor))
+static void debuginfo_init(void)
+{
+qemu_mutex_init();
+}
+
+bool debuginfo_report_elf(const char *image_name, int image_fd,
+  unsigned long long load_bias)
+{
+QEMU_LOCK_GUARD();
+
+if (dwfl == NULL) {
+dwfl = dwfl_begin(_callbacks);
+} else {
+dwfl_report_begin_add(dwfl);
+}
+
+if (dwfl == NULL) {
+return false;
+}
+
+dwfl_report_elf(dwfl, image_name, image_name, image_fd, load_bias, true);
+dwfl_report_end(dwfl, NULL, NULL);
+return true;
+}
+
+bool debuginfo_get_symbol(unsigned long long address,
+  const char **symbol, unsigned long long *offset)
+{
+Dwfl_Module *dwfl_module;
+GElf_Off dwfl_offset;
+GElf_Sym dwfl_sym;
+
+QEMU_LOCK_GUARD();
+
+if (dwfl == NULL) {
+return false;
+}
+
+dwfl_module = dwfl_addrmodule(dwfl, address);
+if (dwfl_module == NULL) {
+return false;
+}
+
+*symbol = dwfl_module_addrinfo(dwfl_module, address, _offset,
+   _sym, NULL, NULL, NULL);
+if (*symbol == NULL) {
+return false;
+}
+*offset = dwfl_offset;
+return true;
+}
+
+bool debuginfo_get_line(unsigned long long address,
+const char **file, int *line)
+{
+Dwfl_Module *dwfl_module;
+Dwfl_Line *dwfl_line;
+
+QEMU_LOCK_GUARD();
+
+if (dwfl == NULL) {
+return false;
+}
+
+dwfl_module = dwfl_addrmodule(dwfl, address);
+if (dwfl_module == NULL) {
+return false;
+}
+
+dwfl_line = dwfl_module_getsrc(dwfl_module, address);
+if (dwfl_line == NULL) {
+return false;
+}
+*file = dwfl_lineinfo(dwfl_line, NULL, line, 0, NULL, NULL);
+return true;
+}
diff --git a/accel/tcg/debuginfo.h b/accel/tcg/debuginfo.h
new file mode 100644
index 000..d41d9d8d9b4
--- /dev/null
+++ b/accel/tcg/debuginfo.h
@@ -0,0 +1,52 @@
+/*
+ * Debug information support.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef ACCEL_TCG_DEBUGINFO_H
+#define ACCEL_TCG_DEBUGINFO_H
+
+#if defined(CONFIG_TCG) && defined(CONFIG_LIBDW)
+/*
+ * Load debuginfo for the specified guest ELF image.
+ * Return true on success, false on failure.
+ */
+bool debuginfo_report_elf(const char *image_name, int image_fd,
+  unsigned long long load_bias);
+
+/*
+ * Find a symbol name associated with the specified guest PC.
+ * Return true on success, false if there is no associated symbol.
+ */
+bool debuginfo_get_symbol(unsigned long long address,
+  const char **symbol, unsigned long long *offset);
+
+/*
+ * Find a line number associated with the specified guest PC.
+ * Return true on success, false if there is no associated line number.
+ */
+bool d

Re: [PATCH] s390x: Fix spelling errors

2022-11-14 Thread Ilya Leoshkevich

On Fri, Nov 11, 2022 at 07:28:28PM +0100, Thomas Huth wrote:
> Fix typos (discovered with the 'codespell' utility).
> 
> Signed-off-by: Thomas Huth 
> ---
>  hw/s390x/ipl.h  | 2 +-
>  pc-bios/s390-ccw/cio.h  | 2 +-
>  pc-bios/s390-ccw/iplb.h | 2 +-
>  target/s390x/cpu_models.h   | 4 ++--
>  hw/s390x/s390-pci-vfio.c| 2 +-
>  hw/s390x/s390-virtio-ccw.c  | 6 +++---
>  target/s390x/ioinst.c   | 2 +-
>  target/s390x/tcg/excp_helper.c  | 2 +-
>  target/s390x/tcg/fpu_helper.c   | 2 +-
>  target/s390x/tcg/misc_helper.c  | 2 +-
>  target/s390x/tcg/translate.c| 4 ++--
>  target/s390x/tcg/translate_vx.c.inc | 6 +++---
>  pc-bios/s390-ccw/start.S| 2 +-
>  13 files changed, 19 insertions(+), 19 deletions(-)

Reviewed-by: Ilya Leoshkevich

[PATCH v2 0/1] tcg: add perfmap and jitdump

2022-11-14 Thread Ilya Leoshkevich

v1:
https://lists.nongnu.org/archive/html/qemu-devel/2022-10/msg01824.html
https://lists.nongnu.org/archive/html/qemu-devel/2022-11/msg01073.html

v1 -> v2:
* Use QEMU_LOCK_GUARD (Alex).
* Handle TARGET_TB_PCREL (Alex).
* Support ELF -kernels, add a note about this (Alex). Tested with
  qemu-system-x86_64 and Linux kernel - it's not fast, but it works.
* Minor const correctness and style improvements.

Ilya Leoshkevich (1):
  tcg: add perfmap and jitdump

 accel/tcg/debuginfo.c |  99 +++
 accel/tcg/debuginfo.h |  52 ++
 accel/tcg/meson.build |   2 +
 accel/tcg/perf.c  | 334 ++
 accel/tcg/perf.h  |  28 
 accel/tcg/translate-all.c |   8 +
 docs/devel/tcg.rst|  23 +++
 hw/core/loader.c  |   5 +
 include/tcg/tcg.h |   4 +-
 linux-user/elfload.c  |   3 +
 linux-user/exit.c |   2 +
 linux-user/main.c |  15 ++
 linux-user/meson.build|   1 +
 meson.build   |   8 +
 qemu-options.hx   |  20 +++
 softmmu/vl.c  |  11 ++
 tcg/region.c  |   2 +-
 tcg/tcg.c |   2 +
 18 files changed, 616 insertions(+), 3 deletions(-)
 create mode 100644 accel/tcg/debuginfo.c
 create mode 100644 accel/tcg/debuginfo.h
 create mode 100644 accel/tcg/perf.c
 create mode 100644 accel/tcg/perf.h

-- 
2.37.2

Re: [PATCH 18/26] target/s390x: Split per_branch

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:13PM -0700, Richard Henderson wrote:
> Split into per_branch_dest and per_branch_disp, which can be
> used for direct and indirect.  In preperation for TARGET_TB_PCREL,
> call per_branch_* before indirect branches.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 32 ++--
>  1 file changed, 18 insertions(+), 14 deletions(-)
> 
> diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
> index 712f6d5795..bd2ee1c96e 100644
> --- a/target/s390x/tcg/translate.c
> +++ b/target/s390x/tcg/translate.c
> @@ -350,21 +350,25 @@ static inline bool per_enabled(DisasContext *s)
>  #endif
>  }
>  
> -static void per_branch(DisasContext *s, bool to_next)
> +static void per_branch_dest(DisasContext *s, TCGv_i64 dest)
>  {
>  #ifndef CONFIG_USER_ONLY
>  gen_psw_addr_disp(s, gbea, 0);
> +if (s->base.tb->flags & FLAG_MASK_PER) {
> +gen_helper_per_branch(cpu_env, gbea, dest);
> +}
> +#endif
> +}
>  
> -if (per_enabled(s)) {
> -if (to_next) {
> -TCGv_i64 next_pc = tcg_temp_new_i64();
> -
> -gen_psw_addr_disp(s, next_pc, s->ilen);
> -gen_helper_per_branch(cpu_env, gbea, next_pc);
> -tcg_temp_free_i64(next_pc);
> -} else {
> -gen_helper_per_branch(cpu_env, gbea, psw_addr);
> -}
> +static void per_branch_disp(DisasContext *s, int64_t disp)
> +{
> +#ifndef CONFIG_USER_ONLY
> +gen_psw_addr_disp(s, gbea, 0);
> +if (s->base.tb->flags & FLAG_MASK_PER) {
> +TCGv_i64 dest = tcg_temp_new_i64();
> +gen_psw_addr_disp(s, dest, disp);
> +gen_helper_per_branch(cpu_env, gbea, dest);
> +tcg_temp_free_i64(dest);
>  }
>  #endif
>  }

...

Nit: maybe use per_enabled(s) instead of testing the mask manually?

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 16/26] target/s390x: Disable conditional branch-to-next for PER

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:11PM -0700, Richard Henderson wrote:
> For PER, we require a conditional call to helper_per_branch
> for the conditional branch.  Fold the remaining optimization
> into a call to helper_goto_direct, which will take care of
> the remaining gbea adjustment.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 15/26] target/s390x: Introduce per_enabled

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:10PM -0700, Richard Henderson wrote:
> Hoist the test of FLAG_MASK_PER to a helper.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 23 ---
>  1 file changed, 16 insertions(+), 7 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 14/26] target/s390x: Don't set gbea for user-only

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:09PM -0700, Richard Henderson wrote:
> The rest of the per_* functions have this ifdef;
> this one seemed to be missing.
> 
> Signed-off-by: Richard Henderson 

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 13/26] target/s390x: Add disp argument to update_psw_addr

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:08PM -0700, Richard Henderson wrote:
> Rename to update_psw_addr_disp at the same time.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 22 +++---
>  1 file changed, 11 insertions(+), 11 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 12/26] target/s390x: Move masking of psw.addr to cpu_get_tb_cpu_state

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:07PM -0700, Richard Henderson wrote:
> Masking after the fact in s390x_tr_init_disas_context
> provides incorrect information to tb_lookup.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/cpu.h   | 13 +++--
>  target/s390x/tcg/translate.c |  6 --
>  2 files changed, 7 insertions(+), 12 deletions(-)

How can we end up in a situation where this matters? E.g. if we are in
64-bit mode and execute

0xa12345678: sam31

we will get a specification exception, and cpu_get_tb_cpu_state() will
not run. And for valid 31-bit addresses masking should be a no-op.

Re: [PATCH 11/26] target/s390x: Use ilen instead in branches

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:06PM -0700, Richard Henderson wrote:
> Remove the remaining uses of pc_tmp, and remove the variable.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 13 +++--
>  1 file changed, 3 insertions(+), 10 deletions(-)

Reviewed-by: Ilya Leoshkevich

Re: [PATCH 10/26] target/s390x: Use gen_psw_addr_disp in op_sam

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:05PM -0700, Richard Henderson wrote:
> Complicated because we may now require a runtime jump.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 40 +---
>  1 file changed, 28 insertions(+), 12 deletions(-)

Reviewed-by: Ilya Leoshkevich

[PATCH] tests/tcg/s390x: Add bal.S

2022-11-03 Thread Ilya Leoshkevich

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/bal.S   | 24 
 2 files changed, 25 insertions(+)
 create mode 100644 tests/tcg/s390x/bal.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index a34fa68473e..295df084919 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -7,3 +7,4 @@ QEMU_OPTS=-action panic=exit-failure -kernel
-Wl,--build-id=none $< -o $@
 
 TESTS += unaligned-lowcore
+TESTS += bal
diff --git a/tests/tcg/s390x/bal.S b/tests/tcg/s390x/bal.S
new file mode 100644
index 000..e54d8874ff9
--- /dev/null
+++ b/tests/tcg/s390x/bal.S
@@ -0,0 +1,24 @@
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lpswe start24_psw
+_start24:
+lgrl %r0,initial_r0
+lgrl %r1,expected_r0
+bal %r0,0f
+0:
+cgrjne %r0,%r1,1f
+lpswe success_psw
+1:
+lpswe failure_psw
+.align 8
+start24_psw:
+.quad 0x1600,_start24  /* 24-bit mode, cc = 1, pm = 6 */
+initial_r0:
+.quad 0x1234567887654321
+expected_r0:
+.quad 0x123456789600 + 0b  /* ilc = 2, cc = 1, pm = 6 */
+success_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2,0/* disabled wait */
-- 
2.37.2

Re: [PATCH 09/26] target/s390x: Use gen_psw_addr_disp in save_link_info

2022-11-03 Thread Ilya Leoshkevich

On Wed, Oct 05, 2022 at 08:44:04PM -0700, Richard Henderson wrote:
> Trivial but non-mechanical conversion away from pc_tmp.
> 
> Reviewed-by: Philippe Mathieu-Daudé 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/tcg/translate.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)

I was a bit worried about this sequence and wrote a regression test.
It did not find any issues, but I will post it here, might prove useful
some day.

Reviewed-by: Ilya Leoshkevich

< 2 3 4 5 6 7 8 9 10 >

601 - 700 of 967 matches

Mail list logo