wingo pushed a commit to branch wip-whippet in repository guile. commit 367e04f164a5908ab98c9a0c4cd1d8210a4cdae4 Author: Andy Wingo <wi...@igalia.com> AuthorDate: Fri Feb 14 12:30:40 2025 +0100
Add documentation on tracepoints Also clean up how-to-build documentation --- ctf_to_json.py | 160 ++++++++++++++++++++++++++++++++++ doc/manual.md | 214 ++++++++++++---------------------------------- doc/perfetto-minor-gc.png | Bin 0 -> 173475 bytes doc/tracepoints.md | 126 +++++++++++++++++++++++++++ 4 files changed, 340 insertions(+), 160 deletions(-) diff --git a/ctf_to_json.py b/ctf_to_json.py new file mode 100755 index 000000000..f6b7f429a --- /dev/null +++ b/ctf_to_json.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# Any copyright is dedicated to the Public Domain. +# https://creativecommons.org/publicdomain/zero/1.0/ +# +# Originally written by Andy Wingo <wi...@igalia.com>. + +import bt2 # From the babeltrace2 package. +import sys +import json +from enum import Enum + +# Usage: ./ctf_to_json.py ~/lttng-traces/name-of-your-trace > foo.json +# +# Convert a Common Trace Format (CTF) trace, for example as produced by +# LTTng, to the JSON-based Trace Event Format (TEF), for example as +# consumed by `chrome://tracing`, `https://ui.perfetto.dev/`, or +# `https://profiler.firefox.com`. + +# The Trace Event Format is documented here: +# +# https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0 + +# By default, events are emitted as EventPhase.INSTANT. We also support +# rewriting the event stream so as to generate EventPhase.BEGIN / +# EventPhase.END events for specific named events. 
+ +synthetic_events = { + 'gc': ['whippet:mutator_cause_gc', + 'whippet:restarting_mutators'], + 'stop-the-world': ['whippet:requesting_stop', + 'whippet:mutators_stopped'], + 'trace': ['whippet:prepare_gc', + 'whippet:restarting_mutators'], + 'mutator-stopped': ['whippet:mutator_stopping', + 'whippet:mutator_restarted'], + 'trace-roots': ['whippet:trace_roots_begin', + 'whippet:trace_roots_end'], + 'trace-check-termination': ['whippet:trace_check_termination_begin', + 'whippet:trace_check_termination_end'], + 'trace-objects': ['whippet:trace_objects_begin', + 'whippet:trace_objects_end'], + 'trace-worker': ['whippet:trace_worker_begin', + 'whippet:trace_worker_end'] +} + +class EventPhase(Enum): + BEGIN = 'B' + END = 'E' + COMPLETE = 'X' + INSTANT = 'i' + COUNTER = 'C' + NESTABLE_START = 'b' + NESTABLE_INSTANT = 'n' + NESTABLE_END = 'e' + FLOW_START = 's' + FLOW_STEP = 't' + FLOW_END = 'f' + SAMPLE = 'P' + OBJECT_CREATED = 'N' + OBJECT_SNAPSHOT = 'O' + OBJECT_DESTROYED = 'D' + METADATA = 'M' + MEMORY_DUMP_GLOBAL = 'V' + MEMORY_DUMP_PROCESS = 'v' + MARK = 'R' + CLOCK_SYNC = 'c' + CONTEXT_BEGIN = '(' + CONTEXT_END = ')' + +base_time = None +def event_us(msg): + assert(msg.default_clock_snapshot.clock_class.name == 'monotonic') + assert(msg.default_clock_snapshot.clock_class.frequency == 1e9) + global base_time + ns = msg.default_clock_snapshot.value + if base_time is None: + base_time = ns + return (ns - base_time) * 1e-3 + +def lower(x): + if isinstance(x, str) or isinstance(x, int) or isinstance(x, float): + return x + if isinstance(x, dict) or isinstance(x, bt2._StructureFieldConst): + return {lower(k):lower(v) for k, v in x.items()} + if isinstance(x, bt2._BoolValueConst) or isinstance(x, bt2._BoolFieldConst): + return bool(x) + if isinstance(x, bt2._EnumerationFieldConst): + return repr(x) + if isinstance(x, bt2._IntegerValueConst) or isinstance(x, bt2._IntegerFieldConst): + return int(x) + if isinstance(x, bt2._RealValueConst) or isinstance(x, 
bt2._RealFieldConst): + return float(x) + if isinstance(x, bt2._StringValueConst) or isinstance(x, bt2._StringFieldConst): + return str(x) + raise ValueError("Unexpected value from trace", x) + +# Specific Whippet events. +synthetic_begin = {} +synthetic_end = {} +for synthetic, [begin, end] in synthetic_events.items(): + synthetic_begin[begin] = [] + synthetic_end[end] = [] +for synthetic, [begin, end] in synthetic_events.items(): + synthetic_begin[begin].append(synthetic) + synthetic_end[end].append(synthetic) + +def put(str): + sys.stdout.write(str) + +need_comma = False +def print_event(ev): + global need_comma + if need_comma: + sys.stdout.write(',\n ') + else: + need_comma = True + # It appears to be faster to make a string, then print the string, + # than to call json.dump with a file object. + # json.dump(ev, sys.stdout, ensure_ascii=False, check_circular=False) + put(json.dumps(ev, ensure_ascii=False, check_circular=False)) + +def emit_event(msg, name, phase): + ev = {'name': name, + 'cat': 'whippet', + 'ph': phase.value, + 'ts': event_us(msg), + 'pid': lower(msg.event.common_context_field['vpid']), + 'tid': lower(msg.event.common_context_field['vtid']), + 'args': lower(msg.event.payload_field)} + print_event(ev) +def emit_begin_event(msg, name): + emit_event(msg, name, EventPhase.BEGIN) +def emit_end_event(msg, name): + emit_event(msg, name, EventPhase.END) + +def emit_events(msg): + emit_event(msg, msg.event.name, EventPhase.INSTANT) + for begin in synthetic_begin.get(msg.event.name, []): + emit_begin_event(msg, begin) + for end in synthetic_end.get(msg.event.name, []): + emit_end_event(msg, end) + +def ctf_to_json(path): + msg_it = bt2.TraceCollectionMessageIterator(path) + put('{\n') + put(' "traceEvents": [\n ') + for msg in msg_it: + if hasattr(msg, 'event'): + emit_events(msg) + put('\n') + put('\n ],\n') + put(' "displayTimeUnit": "ns"\n') + put('}\n') + +if len(sys.argv) != 2: + sys.stderr.write( + 'usage: ' + sys.argv[0] + ' 
~/lttng-traces/name-of-your-trace\n') + sys.exit(1) +else: + ctf_to_json(sys.argv[1]) diff --git a/doc/manual.md b/doc/manual.md index 1ddfcb556..a6742cbe5 100644 --- a/doc/manual.md +++ b/doc/manual.md @@ -176,13 +176,14 @@ implementations of that API: `semi`, a simple semi-space collector; collector; and `mmc`, a mostly-marking collector inspired by Immix. The program that embeds Whippet selects the collector implementation at -build-time. In the case of the `mmc` collector, the program -also configures a specific collector mode, again at build-time: -generational or not, parallel or not, stack-conservative or not, and -heap-conservative or not. It may be nice in the future to be able to -configure these at run-time, but for the time being they are -compile-time options so that adding new features doesn't change the -footprint of a more minimal collector. +build-time. For `pcc`, the program can also choose whether to be +generational or not. For the `mmc` collector, the program configures a +specific collector mode, again at build-time: generational or not, +parallel or not, stack-conservative or not, and heap-conservative or +not. It may be nice in the future to be able to configure these at +run-time, but for the time being they are compile-time options so that +adding new features doesn't change the footprint of a more minimal +collector. Different collectors have different allocation strategies: for example, the BDW collector allocates from thread-local freelists, whereas the @@ -199,97 +200,58 @@ compiling user code. ### Compiling the collector -Building the collector is not as easy as it should be. As an embed-only -library, we don't get to choose the One True Build System and then just -build the software in that way; instead Whippet needs to be buildable -with any build system. 
At some point we will have snippets that -embedders can include in their various build systems, but for now we -document the low-level structure, so that people can craft the -appropriate incantations for their program's build system. - -Whippet consists of some collector-implementation-agnostic independent -modules, and then the collector implementation itself. Though Whippet -tries to put performance-sensitive interfaces in header files, users -should also compile with link-time optimization (LTO) to remove any -overhead imposed by the division of code into separate compilation -units. - -Usually you want to build with maximum optimization and no debugging -assertions. Sometimes you want minimal optimization and all assertions. -Here's what we do, as a `Makefile` snippet: +As an embed-only library, Whippet needs to be integrated into the build +system of its host (embedder). Currently the only supported build +system uses GNU make. We would be happy to add other systems over time. -``` -DEFAULT_BUILD=opt -BUILD_CFLAGS_opt=-O2 -g -DNDEBUG -BUILD_CFLAGS_optdebug=-Og -g -DGC_DEBUG=1 -BUILD_CFLAGS_debug=-O0 -g -DGC_DEBUG=1 -BUILD_CFLAGS=$(BUILD_CFLAGS_$(or $(BUILD),$(DEFAULT_BUILD))) -``` - -So if you do just plain `make`, it will do an `opt` build. You can -specify the build mode by setting `BUILD` on the command line, as in -`make BUILD=debug`. +At a high level, first the embedder chooses a collector and defines how +to specialize the collector against the embedder. Whippet's `embed.mk` +Makefile snippet then defines how to build the set of object files that +define the collector, and how to specialize the embedder against the +chosen collector. -Then for the actual compilation flags, we do: +As an example, say you have a file `program.c`, and you want to compile +it against a Whippet checkout in `whippet/`. Your headers are in +`include/`, and you have written an implementation of the embedder +interface in `host-gc.h`. 
In that case you would have a Makefile like +this: ``` -CC=gcc -CFLAGS=-Wall -flto -fno-strict-aliasing -fvisibility=hidden -Wno-unused $(BUILD_CFLAGS) -INCLUDES=-I. -LDFLAGS=-lpthread -flto -COMPILE=$(CC) $(CFLAGS) $(INCLUDES) -``` +HOST_DIR:=$(dir $(lastword $(MAKEFILE_LIST))) +WHIPPET_DIR=$(HOST_DIR)whippet/ -The actual include directory (the dot in `-I.`) should be adjusted as -appropriate. +all: out -#### Collector-implementation-agnostic independent modules +# The collector to choose: e.g. semi, bdw, pcc, generational-pcc, mmc, +# parallel-mmc, etc. +GC_COLLECTOR=pcc -There are currently four generic modules that don't depend on the choice -of collector. The first is `gc-stack.o`, which has supporting code to -associate mutators (threads) with slices of the native stack, in order -to support conservative root-finding. +include $(WHIPPET_DIR)embed.mk -``` -$(COMPILE) -o gc-stack.o -c gc-stack.c -``` +# Host cflags go here... +HOST_CFLAGS= -The next is a generic options interface, to allow the user to -parameterize the collector at run-time, for example to implement a -specific heap sizing strategy. +# Whippet's embed.mk uses this variable when it compiles code that +# should be specialized against the embedder. +EMBEDDER_TO_GC_CFLAGS=$(HOST_CFLAGS) -include $(HOST_DIR)host-gc.h +program.o: program.c + $(GC_COMPILE) $(HOST_CFLAGS) $(GC_TO_EMBEDDER_CFLAGS) -c $< +program: program.o $(GC_OBJS) + $(GC_LINK) $^ $(GC_LIBS) ``` -$(COMPILE) -o gc-options.o -c gc-options.c -``` - -Next, where Whippet needs to get data from the operating system, for -example the number of processors available, it does so behind an -abstract interface that is selected at compile-time. The only -implementation currently is for GNU/Linux, but it's a pretty thin layer, -so adding more systems should not be difficult. -``` -PLATFORM=gnu-linux -$(COMPILE) -o gc-platform.o -c gc-platform-$(PLATFORM).c -``` +The optimization settings passed to the C compiler are taken from +`GC_BUILD_CFLAGS`. 
Embedders can override this variable directly, or +via the shorthand `GC_BUILD` variable. A `GC_BUILD` of `opt` indicates +maximum optimization and no debugging assertions; `optdebug` adds +debugging assertions; and `debug` removes optimizations. -Finally, something a little more complicated: ephemerons. Ephemerons -are objects that make a weak association between a key and a value. As -first-class objects, they need to be classifiable by the user system, -and notably via the `gc_trace_object` procedure, and therefore need to -have a header whose shape is understandable by the embedding program. -We do this by including the `gc-embedder-api.h` implementation, via -`-include`, in this case providing `foo-embedder.h`: - -``` -$(COMPILE) -include foo-embedder.h -o gc-ephemeron.o -c gc-ephemeron.c -``` - -As for ephemerons, finalizers also have their own compilation unit. - -``` -$(COMPILE) -include foo-embedder.h -o gc-finalizer.o -c gc-finalizer.c -``` +Though Whippet tries to put performance-sensitive interfaces in header +files, users should also compile with link-time optimization (LTO) to +remove any overhead imposed by the division of code into separate +compilation units. `embed.mk` includes the necessary LTO flags in +`GC_CFLAGS` and `GC_LDFLAGS`. #### Compile-time options @@ -316,82 +278,14 @@ Some collectors require specific compile-time options. For example, the semi-space collector has to be able to move all objects; this is not compatible with conservative roots or heap edges. -#### Building `semi` - -Finally, let's build a collector. The simplest collector is the -semi-space collector. The entirety of the implementation can be had by -compiling `semi.c`, providing the program's embedder API implementation -via `-include`: - -``` -$(COMPILE) -DGC_PRECISE_ROOTS=1 -include foo-embedder.h -o gc.o -c semi.c -``` - -#### Building `bdw` - -The next simplest collector uses -[BDW-GC](https://github.com/ivmai/bdwgc). 
This collector must scan the -roots and heap conservatively. The collector is parallel if BDW-GC -itself was compiled with parallelism enabled. +#### Tracing support -``` -$(COMPILE) -DGC_CONSERVATIVE_ROOTS=1 -DGC_CONSERVATIVE_TRACE=1 \ - `pkg-config --cflags bdw-gc` \ - -include foo-embedder.h -o gc.o -c bdw.c -``` - -#### Building `pcc` - -The parallel copying collector is like `semi` but better in every way: -it supports multiple mutator threads, and evacuates in parallel if -multiple threads are available. - -``` -$(COMPILE) -DGC_PARALLEL=1 -DGC_PRECISE_ROOTS=1 \ - -include foo-embedder.h -o gc.o -c pcc.c -``` - -You can also build `pcc` in a generational configuration by passing -`-DGC_GENERATIONAL=1`. The nursery is 2 MB per active mutator, capped -to the number of processors, so if the last cycle had a maximum of 4 -mutator threads active at the same time and your machine has 24 cores, -your nursery would be 8 MB. - -#### Building `mmc` - -Finally, there is the mostly-marking collector. It can collect roots -precisely or conservatively, trace precisely or conservatively, be -parallel or not, and be generational or not. - -``` -$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ - -include foo-embedder.h -o gc.o -c mvv.c -``` - -### Compiling your program - -Any compilation unit that uses the GC API should have the same set of -compile-time options defined as when compiling the collector. -Additionally those compilation units should include the "attributes" -header for the collector in question, namely `semi-attrs.h`, -`bdw-attrs.h`, `pcc-attrs.h`, or `mmc-attrs.h`. For example, for -parallel generational mmc, you might have: - -``` -$(COMPILE) -DGC_PARALLEL=1 -DGC_GENERATIONAL=1 -DGC_PRECISE_ROOTS=1 \ - -include mmc-attrs.h -o my-program.o -c my-program.c -``` - -### Linking the collector into your program - -Finally to link, pass all objects to the linker. 
You will want to -ensure that the linker enables `-flto`, for link-time optimization. We -do it like this: - -``` -$(CC) $(LDFLAGS) -o my-program \ - my-program.o gc-stack.o gc-platform.o gc-options.o gc-ephemeron.o -``` +Whippet includes support for low-overhead run-time tracing via +[LTTng](https://lttng.org/). If the support library `lttng-ust` is +present when Whippet is compiled (as checked via `pkg-config`), +tracepoint support will be present. See +[tracepoints.md](./tracepoints.md) for more information on how to get +performance traces out of Whippet. ## Using the collector diff --git a/doc/perfetto-minor-gc.png b/doc/perfetto-minor-gc.png new file mode 100644 index 000000000..3c528ae42 Binary files /dev/null and b/doc/perfetto-minor-gc.png differ diff --git a/doc/tracepoints.md b/doc/tracepoints.md new file mode 100644 index 000000000..a6dcbcdfb --- /dev/null +++ b/doc/tracepoints.md @@ -0,0 +1,126 @@ +# Whippet performance tracing + +Whippet includes support for run-time tracing via +[LTTng](https://LTTng.org) user-space tracepoints. This allows you to +get a detailed look at how Whippet is performing on your system. +Tracing support is currently limited to Linux systems. + +## Getting started + +First, you need to build Whippet with LTTng support. Usually this is as +easy as building it in an environment where the `lttng-ust` library is +present, as determined by `pkg-config --libs lttng-ust`. You can tell +whether your Whippet has tracing support by checking whether the resulting binaries +are dynamically linked to `liblttng-ust`. + +If we take as an example the `mt-gcbench` test in the Whippet source +tree, we would have: + +``` +$ ldd bin/mt-gcbench.pcc | grep lttng +... +liblttng-ust.so.1 => ... +... +``` + +### Capturing traces + +Actually capturing traces is a little annoying; it's not as easy as +`perf record`. The [LTTng +documentation](https://lttng.org/docs/v2.13/#doc-controlling-tracing) is +quite thorough, but here is a summary. 
+ +First, create your tracing session: + +``` +$ lttng create +Session auto-20250214-091153 created. +Traces will be output to $HOME/lttng-traces/auto-20250214-091153 +``` + +You run all these commands as your own user; they don't require root +permissions or system-wide modifications, as all of the Whippet +tracepoints are user-space tracepoints (UST). + +Just having an LTTng session created won't do anything though; you need +to configure the session. Monotonic nanosecond-resolution timestamps +are already implicitly part of each event. We also want to have process +and thread IDs for all events: + +``` +$ lttng add-context --userspace --type=vpid --type=vtid +ust context vpid added to all channels +ust context vtid added to all channels +``` + +Now enable Whippet events: + +``` +$ lttng enable-event --userspace 'whippet:*' +ust event whippet:* created in channel channel0 +``` + +And now, start recording: + +``` +$ lttng start +Tracing started for session auto-20250214-091153 +``` + +With this, traces will be captured for our program of interest: + +``` +$ bin/mt-gcbench.pcc 2.5 8 +... +``` + +Now stop the trace: + +``` +$ lttng stop +Waiting for data availability +Tracing stopped for session auto-20250214-091153 +``` + +Whew. If we did it right, our data is now in +$HOME/lttng-traces/auto-20250214-091153. + +### Visualizing traces + +LTTng produces traces in the [Common Trace Format +(CTF)](https://diamon.org/ctf/). My favorite trace viewing tool is the +family of web-based trace viewers derived from `chrome://tracing`. The +best of these appear to be [the Firefox +profiler](https://profiler.firefox.com) and +[Perfetto](https://ui.perfetto.dev). Unfortunately neither of these can +work with CTF directly, so we instead need to run a trace converter. + +Oddly, there is no trace converter that can read CTF and write something +that Perfetto (e.g.) can read. 
However there is a JSON-based tracing +format that Perfetto can read, and [Python bindings for Babeltrace, a +library that works with CTF](https://babeltrace.org/), so that's what we +do: + +``` +$ python3 ctf_to_json.py ~/lttng-traces/auto-20250214-091153 > trace.json +``` + +While Firefox Profiler can load this file, it works better on Perfetto, +as the Whippet events are visually rendered on their respective threads. + + + +### Expanding the set of events + +As of February 2025, +the current set of tracepoints includes the [heap +events](https://github.com/wingo/whippet/blob/main/doc/manual.md#statistics) +and some detailed internals of the parallel tracer. We expect this set +of tracepoints to expand over time. + +### Overhead of tracepoints + +When tracepoints are compiled in but no events are enabled, tracepoints +appear to have no impact on run-time. When event collection is on, for +x86-64 hardware, [emitting a tracepoint event takes about +100ns](https://discuss.systems/@DesnoyersMa/113986344940256872).