[Lldb-commits] [lldb] [lldb] Add synthetic support to formatter_bytecode.py (PR #183804)

Dave Lee via lldb-commits Fri, 27 Feb 2026 11:06:26 -0800

https://github.com/kastiglione created 
https://github.com/llvm/llvm-project/pull/183804


None

>From 8ab6a9e7acc6a00de32031947d65480f2f7f4a6c Mon Sep 17 00:00:00 2001
From: Dave Lee <[email protected]>
Date: Thu, 26 Feb 2026 13:31:49 -0800
Subject: [PATCH] [lldb] Add synthetic support to formatter_bytecode.py

---
 lldb/examples/python/formatter_bytecode.py | 345 +++++++++++++++++----
 1 file changed, 280 insertions(+), 65 deletions(-)

diff --git a/lldb/examples/python/formatter_bytecode.py b/lldb/examples/python/formatter_bytecode.py
index 3d0d4d4274de4..377f222ea170a 100644
--- a/lldb/examples/python/formatter_bytecode.py
+++ b/lldb/examples/python/formatter_bytecode.py
@@ -6,6 +6,10 @@
 """
 
 from __future__ import annotations
+import re
+from typing import BinaryIO, Iterable, Iterator, TextIO, Tuple, Union
+
+BINARY_VERSION = 1
 
 # Types
 type_String = 1
@@ -74,6 +78,19 @@ def define_opcode(n, mnemonic, name):
 sig_get_num_children = 2
 sig_get_child_index = 3
 sig_get_child_at_index = 4
+sig_update = 5
+
+SIGNATURES = {
+    "summary": sig_summary,
+    "init": sig_init,
+    "get_num_children": sig_get_num_children,
+    "get_child_index": sig_get_child_index,
+    "get_child_at_index": sig_get_child_at_index,
+    "update": sig_update,
+}
+
+SIGNATURE_NAMES = "|".join(SIGNATURES.keys())
+SIGNATURE_IDS = {v: k for k, v in SIGNATURES.items()}
 
 # Selectors
 selector = dict()
@@ -119,8 +136,82 @@ def define_selector(n, name):
 # Compiler.
 ################################################################################
 
+_SIGNATURE_LABEL = re.compile(f"@(?:{SIGNATURE_NAMES}):$")
+
+
+def _tokenize(assembler: str) -> list[str]:
+    """Convert string of assembly into tokens."""
+    # With one exception, tokens are simply sequences of non-space characters.
+    # The one exception is string literals, which may have spaces.
+
+    # To parse strings, which can contain escaped contents, use a "Friedl
+    # unrolled loop". The high level of such a regex is:
+    #     open normal* ( special normal* )* close
+    # which for string literals is:
+    string_literal = r'" [^"\\]* (?: \\. [^"\\]* )* "'
+
+    return re.findall(rf"{string_literal} | \S+", assembler, re.VERBOSE)
+
+
+def _segment_by_signature(input: list[str]) -> list[Tuple[str, list[str]]]:
+    """Segment the input tokens along signature labels."""
+    segments = []
+
+    # Loop state
+    signature = None
+    tokens = []
+
+    def conclude_segment():
+        if not tokens:
+            raise ValueError(f"empty signature: {signature}")
+        segments.append((signature, tokens))
+
+    for token in input:
+        if _SIGNATURE_LABEL.match(token):
+            if signature:
+                conclude_segment()
+            signature = token[1:-1]  # strip leading @, trailing :
+            tokens = []
+        else:
+            tokens.append(token)
+
+    if signature:
+        conclude_segment()
+
+    return segments
+
+
+def compile_file(type_name: str, input: TextIO, output: BinaryIO) -> None:
+    input_tokens = _tokenize(input.read())
+
+    signatures = {}
+    for sig, tokens in _segment_by_signature(input_tokens):
+        if sig in signatures:
+            raise ValueError(f"duplicate signature: {sig}")
+        signatures[sig] = compile_tokens(tokens)
+
+    # FIXME: review use of ints below, check if any are in fact uleb.
+    bin = bytearray()
+    bin.extend(_to_uleb(len(type_name)))
+    bin.extend(bytes(type_name, encoding="utf-8"))
+    flags = 0
+    bin.extend(_to_byte(flags))
+    for sig, bc in signatures.items():
+        bin.extend(_to_byte(SIGNATURES[sig]))
+        bin.extend(_to_uleb(len(bc)))
+        bin.extend(bc)
+
+    # FIXME: is version a uleb?
+    output.write(_to_byte(BINARY_VERSION))
+    output.write(_to_uleb(len(bin)))
+    output.write(bin)
+
 
 def compile(assembler: str) -> bytearray:
+    return compile_tokens(_tokenize(assembler))
+
+
+def compile_tokens(tokens: list[str]) -> bytearray:
     """Compile assembler into bytecode"""
     # This is a stack of all in-flight/unterminated blocks.
     bytecode = [bytearray()]
@@ -128,7 +219,6 @@ def compile(assembler: str) -> bytearray:
     def emit(byte):
         bytecode[-1].append(byte)
 
-    tokens = list(assembler.split(" "))
     tokens.reverse()
     while tokens:
         tok = tokens.pop()
@@ -152,27 +242,8 @@ def emit(byte):
             emit(op_lit_selector)
             emit(selector[tok])
         elif tok[0] == '"':
-            s = bytearray()
-            done = False
-            chrs = tok[1:]
-            while not done:
-                quoted = False
-                for c in chrs:
-                    if quoted:
-                        s.append(ord(c))  # FIXME
-                        quoted = False
-                    elif c == "\\":
-                        quoted = True
-                    elif c == '"':
-                        done = True
-                        break
-                        # FIXME assert this is last in token
-                    else:
-                        s.append(ord(c))
-                if not done:
-                    s.append(ord(" "))
-                    chrs = tokens.pop()
-
+            # Remove backslash escaping '"' and '\'.
+            s = re.sub(r'\\(["\\])', r"\1", tok[1:-1]).encode()
             emit(op_lit_string)
             emit(len(s))
             bytecode[-1].extend(s)
@@ -187,7 +258,34 @@ def emit(byte):
 ################################################################################
 
 
-def disassemble(bytecode: bytearray) -> (str, int):
+def disassemble_file(input: BinaryIO, output: TextIO) -> None:
+    import io
+
+    stream = io.BytesIO(input.read())
+
+    version = stream.read(1)[0]
+    if version != BINARY_VERSION:
+        raise ValueError(f"unknown binary version: {version}")
+
+    record_size = _from_uleb(stream)
+    stream.truncate(stream.tell() + record_size)
+
+    name_size = _from_uleb(stream)
+    _type_name = stream.read(name_size).decode()
+    _flags = stream.read(1)[0]
+
+    while True:
+        sig_byte = stream.read(1)
+        if not sig_byte:
+            break
+        sig_name = SIGNATURE_IDS[sig_byte[0]]
+        body_size = _from_uleb(stream)
+        bc = stream.read(body_size)
+        asm, _ = disassemble(bc)
+        print(f"@{sig_name}: {asm}", file=output)
+
+
+def disassemble(bytecode: Union[bytes, bytearray]) -> Tuple[str, list[int]]:
     """Disassemble bytecode into (assembler, token starts)"""
     asm = ""
     all_bytes = list(bytecode)
@@ -221,11 +319,14 @@ def next_byte():
             asm += selector[b]
         elif b == op_lit_string:
             length = next_byte()
-            s = "'"
-            while length:
-                s += chr(next_byte())
-                length -= 1
-            asm += '"' + repr(s)[2:]
+            s = '"'
+            for _ in range(length):
+                c = chr(next_byte())
+                if c in ('"', "\\"):
+                    s += "\\"
+                s += c
+            s += '"'
+            asm += s
         else:
             asm += opcode[b]
 
@@ -468,17 +569,51 @@ def next_byte():
             else:
                 print("not implemented: " + selector[sel])
                 assert False
-                pass
     return data[-1]
 
 
-if __name__ == "__main__":
-    # Work around the fact that one of the local files is called
-    # types.py, which breaks some versions of python.
-    import os, sys
+################################################################################
+# Helper functions.
+################################################################################
 
-    path = os.path.abspath(os.path.dirname(__file__))
-    sys.path.remove(path)
+
+def _to_uleb(value: int) -> bytearray:
+    """Encode an integer to ULEB128 bytes."""
+    if value < 0:
+        raise ValueError(f"negative number cannot be encoded to ULEB128: {value}")
+
+    result = bytearray()
+    while True:
+        byte = value & 0x7F
+        value >>= 7
+        if value != 0:
+            byte |= 0x80
+        result.append(byte)
+        if value == 0:
+            break
+
+    return result
+
+
+def _from_uleb(stream: BinaryIO) -> int:
+    """Decode a ULEB128 integer by reading bytes from the stream."""
+    result = 0
+    shift = 0
+    while True:
+        byte = stream.read(1)[0]
+        result |= (byte & 0x7F) << shift
+        shift += 7
+        if not (byte & 0x80):
+            break
+
+    return result
+
+
+def _to_byte(n: int) -> bytes:
+    return n.to_bytes(1, "big")
+
+
+def _main():
     import argparse
 
     parser = argparse.ArgumentParser(
@@ -487,43 +622,123 @@ def next_byte():
     See https://lldb.llvm.org/resources/formatterbytecode.html for more details.
     """
     )
+    parser.add_argument("input", help="input file")
+    mode = parser.add_mutually_exclusive_group()
+    mode.add_argument(
+        "-c",
+        "--compile",
+        action="store_true",
+        help="compile assembler into bytecode",
+    )
+    mode.add_argument(
+        "-d",
+        "--disassemble",
+        action="store_true",
+        help="disassemble bytecode",
+    )
     parser.add_argument(
-        "-c", "--compile", type=str, help="compile assembler into bytecode"
+        "-o",
+        "--output",
+        help="output file (required for --compile)",
     )
-    parser.add_argument("-d", "--disassemble", type=str, help="disassemble bytecode")
     parser.add_argument("-t", "--test", action="store_true", help="run unit tests")
+
     args = parser.parse_args()
     if args.compile:
-        print(compile(str(args.compile)).hex())
+        if not args.output:
+            parser.error("--output is required with --compile")
+        with (
+            open(args.input) as input,
+            open(args.output, "wb") as output,
+        ):
+            compile_file(args.type_name, input, output)
+    elif args.disassemble:
+        if args.output:
+            with (
+                open(args.input, "rb") as input,
+                open(args.output, "w") as output,
+            ):
+                disassemble_file(input, output)
+        else:
+            with open(args.input, "rb") as input:
+                disassemble_file(input, sys.stdout)
+
+
+if __name__ == "__main__":
+    # Work around the fact that one of the local files is called
+    # types.py, which breaks some versions of python.
+    import os, sys
 
-    if args.disassemble:
-        print(disassemble(bytearray.fromhex(str(args.disassemble))))
+    path = os.path.abspath(os.path.dirname(__file__))
+    sys.path.remove(path)
+
+    if not ("-t" in sys.argv or "--test" in sys.argv):
+        _main()
+        sys.exit()
 
     ############################################################################
     # Tests.
     ############################################################################
-    if args.test:
-        import unittest
-
-        class TestCompiler(unittest.TestCase):
-            def test(self):
-                self.assertEqual(compile("1u dup").hex(), "200101")
-                self.assertEqual(compile('"1u dup"').hex(), "2206317520647570")
-                self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111")
-                self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20")
-
-                def roundtrip(asm):
-                    self.assertEqual(disassemble(compile(asm))[0], asm)
-
-                roundtrip("1u dup")
-                roundtrip('1u dup "1u dup"')
-                roundtrip("16 < { dup } if")
-                roundtrip('{ { " } " } }')
-
-                self.assertEqual(interpret(compile("1 1 +"), [], []), 2)
-                self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4)
-                self.assertEqual(
-                    interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes"
-                )
-
-        unittest.main(argv=[__file__])
+    import unittest
+
+    class TestCompiler(unittest.TestCase):
+
+        def test_compile(self):
+            self.assertEqual(compile("1u dup").hex(), "200101")
+            self.assertEqual(compile('"1u dup"').hex(), "2206317520647570")
+            self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111")
+            self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20")
+
+            def roundtrip(asm):
+                self.assertEqual(disassemble(compile(asm))[0], asm)
+
+            roundtrip("1u dup")
+            roundtrip("16 < { dup } if")
+            roundtrip('{ { " } " } }')
+
+            # String specific checks.
+            roundtrip('1u "2u 3u"')
+            roundtrip('"a  b"')
+            roundtrip('"a \\" b"')
+
+            self.assertEqual(interpret(compile("1 1 +"), [], []), 2)
+            self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4)
+            self.assertEqual(
+                interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes"
+            )
+
+        def test_compile_file(self):
+            import io
+
+            def run_compile(type_name, asm):
+                out = io.BytesIO()
+                compile_file(type_name, io.StringIO(asm), out)
+                out.seek(0)
+                return out
+
+            def run_disassemble(binary):
+                out = io.StringIO()
+                disassemble_file(binary, out)
+                out.seek(0)
+                return out
+
+            # compile -> disassemble -> compile round-trip: binary is identical.
+            asm = "@summary: dup @get_value_as_unsigned call return\n@get_num_children: drop 5u return"
+            binary1 = run_compile("MyType", asm)
+            dis = run_disassemble(binary1)
+            binary2 = run_compile("MyType", dis.read())
+            self.assertEqual(binary1.getvalue(), binary2.getvalue())
+
+            # disassemble -> compile -> disassemble round-trip: text is identical.
+            dis2 = run_disassemble(binary2)
+            self.assertEqual(dis.getvalue(), dis2.getvalue())
+
+            # disassemble output contains expected signatures.
+            self.assertIn("@summary:", dis.getvalue())
+            self.assertIn("@get_num_children:", dis.getvalue())
+
+            # Duplicate signature is an error.
+            with self.assertRaises(ValueError):
+                run_compile("MyType", "@summary: 1u return\n@summary: 2u return")
+
+    unittest.main(argv=[__file__])

_______________________________________________
lldb-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

[Lldb-commits] [lldb] [lldb] Add synthetic support to formatter_bytecode.py (PR #183804)

Reply via email to