https://github.com/kastiglione created https://github.com/llvm/llvm-project/pull/183804
None >From 8ab6a9e7acc6a00de32031947d65480f2f7f4a6c Mon Sep 17 00:00:00 2001 From: Dave Lee <[email protected]> Date: Thu, 26 Feb 2026 13:31:49 -0800 Subject: [PATCH] [lldb] Add synthetic support to formatter_bytecode.py --- lldb/examples/python/formatter_bytecode.py | 345 +++++++++++++++++---- 1 file changed, 280 insertions(+), 65 deletions(-) diff --git a/lldb/examples/python/formatter_bytecode.py b/lldb/examples/python/formatter_bytecode.py index 3d0d4d4274de4..377f222ea170a 100644 --- a/lldb/examples/python/formatter_bytecode.py +++ b/lldb/examples/python/formatter_bytecode.py @@ -6,6 +6,10 @@ """ from __future__ import annotations +import re +from typing import BinaryIO, Iterable, Iterator, TextIO, Tuple, Union + +BINARY_VERSION = 1 # Types type_String = 1 @@ -74,6 +78,19 @@ def define_opcode(n, mnemonic, name): sig_get_num_children = 2 sig_get_child_index = 3 sig_get_child_at_index = 4 +sig_update = 5 + +SIGNATURES = { + "summary": sig_summary, + "init": sig_init, + "get_num_children": sig_get_num_children, + "get_child_index": sig_get_child_index, + "get_child_at_index": sig_get_child_at_index, + "update": sig_update, +} + +SIGNATURE_NAMES = "|".join(SIGNATURES.keys()) +SIGNATURE_IDS = {v: k for k, v in SIGNATURES.items()} # Selectors selector = dict() @@ -119,8 +136,82 @@ def define_selector(n, name): # Compiler. ################################################################################ +_SIGNATURE_LABEL = re.compile(f"@(?:{SIGNATURE_NAMES}):$") + + +def _tokenize(assembler: str) -> list[str]: + """Convert string of assembly into tokens.""" + # With one exception, tokens are simply sequences of non-space characters. + # The one exception is string literals, which may have spaces. + + # To parse strings, which can contain escaped contents, use a "Friedl + # unrolled loop". The high level of such a regex is: + # open normal* ( special normal* )* close + # which for string literals is: + string_literal = r'" [^"\\]* (?: \\. [^"\\]* )* "' + + return re.findall(rf"{string_literal} | \S+", assembler, re.VERBOSE) + + +def _segment_by_signature(input: list[str]) -> list[Tuple[str, list[str]]]: + """Segment the input tokens along signature labels.""" + segments = [] + + # Loop state + signature = None + tokens = [] + + def conclude_segment(): + if not tokens: + raise ValueError(f"empty signature: {signature}") + segments.append((signature, tokens)) + + for token in input: + if _SIGNATURE_LABEL.match(token): + if signature: + conclude_segment() + signature = token[1:-1] # strip leading @, trailing : + tokens = [] + else: + tokens.append(token) + + if signature: + conclude_segment() + + return segments + + +def compile_file(type_name: str, input: TextIO, output: BinaryIO) -> None: + input_tokens = _tokenize(input.read()) + + signatures = {} + for sig, tokens in _segment_by_signature(input_tokens): + if sig in signatures: + raise ValueError(f"duplicate signature: {sig}") + signatures[sig] = compile_tokens(tokens) + + # FIXME: review use of ints below, check if any are in fact uleb. + bin = bytearray() + bin.extend(_to_uleb(len(type_name))) + bin.extend(bytes(type_name, encoding="utf-8")) + flags = 0 + bin.extend(_to_byte(flags)) + for sig, bc in signatures.items(): + bin.extend(_to_byte(SIGNATURES[sig])) + bin.extend(_to_uleb(len(bc))) + bin.extend(bc) + + # FIXME: is version a uleb? + output.write(_to_byte(BINARY_VERSION)) + output.write(_to_uleb(len(bin))) + output.write(bin) + def compile(assembler: str) -> bytearray: + return compile_tokens(_tokenize(assembler)) + + +def compile_tokens(tokens: list[str]) -> bytearray: """Compile assembler into bytecode""" # This is a stack of all in-flight/unterminated blocks. bytecode = [bytearray()] @@ -128,7 +219,6 @@ def compile(assembler: str) -> bytearray: def emit(byte): bytecode[-1].append(byte) - tokens = list(assembler.split(" ")) tokens.reverse() while tokens: tok = tokens.pop() @@ -152,27 +242,8 @@ def emit(byte): emit(op_lit_selector) emit(selector[tok]) elif tok[0] == '"': - s = bytearray() - done = False - chrs = tok[1:] - while not done: - quoted = False - for c in chrs: - if quoted: - s.append(ord(c)) # FIXME - quoted = False - elif c == "\\": - quoted = True - elif c == '"': - done = True - break - # FIXME assert this is last in token - else: - s.append(ord(c)) - if not done: - s.append(ord(" ")) - chrs = tokens.pop() - + # Remove backslash escaping '"' and '\'. + s = re.sub(r'\\(["\\])', r"\1", tok[1:-1]).encode() emit(op_lit_string) emit(len(s)) bytecode[-1].extend(s) @@ -187,7 +258,34 @@ def emit(byte): ################################################################################ -def disassemble(bytecode: bytearray) -> (str, int): +def disassemble_file(input: BinaryIO, output: TextIO) -> None: + import io + + stream = io.BytesIO(input.read()) + + version = stream.read(1)[0] + if version != BINARY_VERSION: + raise ValueError(f"unknown binary version: {version}") + + record_size = _from_uleb(stream) + stream.truncate(stream.tell() + record_size) + + name_size = _from_uleb(stream) + _type_name = stream.read(name_size).decode() + _flags = stream.read(1)[0] + + while True: + sig_byte = stream.read(1) + if not sig_byte: + break + sig_name = SIGNATURE_IDS[sig_byte[0]] + body_size = _from_uleb(stream) + bc = stream.read(body_size) + asm, _ = disassemble(bc) + print(f"@{sig_name}: {asm}", file=output) + + +def disassemble(bytecode: Union[bytes, bytearray]) -> Tuple[str, list[int]]: """Disassemble bytecode into (assembler, token starts)""" asm = "" all_bytes = list(bytecode) @@ -221,11 +319,14 @@ def next_byte(): asm += selector[b] elif b == op_lit_string: length = next_byte() - s = "'" - while length: - s += chr(next_byte()) - length -= 1 - asm += '"' + repr(s)[2:] + s = '"' + for _ in range(length): + c = chr(next_byte()) + if c in ('"', "\\"): + s += "\\" + s += c + s += '"' + asm += s else: asm += opcode[b] @@ -468,17 +569,51 @@ def next_byte(): else: print("not implemented: " + selector[sel]) assert False - pass return data[-1] -if __name__ == "__main__": - # Work around the fact that one of the local files is called - # types.py, which breaks some versions of python. - import os, sys +################################################################################ +# Helper functions. +################################################################################ - path = os.path.abspath(os.path.dirname(__file__)) - sys.path.remove(path) + +def _to_uleb(value: int) -> bytearray: + """Encode an integer to ULEB128 bytes.""" + if value < 0: + raise ValueError(f"negative number cannot be encoded to ULEB128: {value}") + + result = bytearray() + while True: + byte = value & 0x7F + value >>= 7 + if value != 0: + byte |= 0x80 + result.append(byte) + if value == 0: + break + + return result + + +def _from_uleb(stream: BinaryIO) -> int: + """Decode a ULEB128 integer by reading bytes from the stream.""" + result = 0 + shift = 0 + while True: + byte = stream.read(1)[0] + result |= (byte & 0x7F) << shift + shift += 7 + if not (byte & 0x80): + break + + return result + + +def _to_byte(n: int) -> bytes: + return n.to_bytes(1, "big") + + +def _main(): import argparse parser = argparse.ArgumentParser( @@ -487,43 +622,123 @@ def next_byte(): See https://lldb.llvm.org/resources/formatterbytecode.html for more details. """ ) + parser.add_argument("input", help="input file") + mode = parser.add_mutually_exclusive_group() + mode.add_argument( + "-c", + "--compile", + action="store_true", + help="compile assembler into bytecode", + ) + mode.add_argument( + "-d", + "--disassemble", + action="store_true", + help="disassemble bytecode", + ) parser.add_argument( - "-c", "--compile", type=str, help="compile assembler into bytecode" + "-o", + "--output", + help="output file (required for --compile)", ) - parser.add_argument("-d", "--disassemble", type=str, help="disassemble bytecode") parser.add_argument("-t", "--test", action="store_true", help="run unit tests") + args = parser.parse_args() if args.compile: - print(compile(str(args.compile)).hex()) + if not args.output: + parser.error("--output is required with --compile") + with ( + open(args.input) as input, + open(args.output, "wb") as output, + ): + compile_file(args.type_name, input, output) + elif args.disassemble: + if args.output: + with ( + open(args.input, "rb") as input, + open(args.output, "w") as output, + ): + disassemble_file(input, output) + else: + with open(args.input, "rb") as input: + disassemble_file(input, sys.stdout) + + +if __name__ == "__main__": + # Work around the fact that one of the local files is called + # types.py, which breaks some versions of python. + import os, sys - if args.disassemble: - print(disassemble(bytearray.fromhex(str(args.disassemble)))) + path = os.path.abspath(os.path.dirname(__file__)) + sys.path.remove(path) + + if not ("-t" in sys.argv or "--test" in sys.argv): + _main() + sys.exit() ############################################################################ # Tests. ############################################################################ - if args.test: - import unittest - - class TestCompiler(unittest.TestCase): - def test(self): - self.assertEqual(compile("1u dup").hex(), "200101") - self.assertEqual(compile('"1u dup"').hex(), "2206317520647570") - self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111") - self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20") - - def roundtrip(asm): - self.assertEqual(disassemble(compile(asm))[0], asm) - - roundtrip("1u dup") - roundtrip('1u dup "1u dup"') - roundtrip("16 < { dup } if") - roundtrip('{ { " } " } }') - - self.assertEqual(interpret(compile("1 1 +"), [], []), 2) - self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4) - self.assertEqual( - interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes" - ) - - unittest.main(argv=[__file__]) + import unittest + + class TestCompiler(unittest.TestCase): + + def test_compile(self): + self.assertEqual(compile("1u dup").hex(), "200101") + self.assertEqual(compile('"1u dup"').hex(), "2206317520647570") + self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111") + self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20") + + def roundtrip(asm): + self.assertEqual(disassemble(compile(asm))[0], asm) + + roundtrip("1u dup") + roundtrip("16 < { dup } if") + roundtrip('{ { " } " } }') + + # String specific checks. + roundtrip('1u "2u 3u"') + roundtrip('"a b"') + roundtrip('"a \\" b"') + + self.assertEqual(interpret(compile("1 1 +"), [], []), 2) + self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4) + self.assertEqual( + interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes" + ) + + def test_compile_file(self): + import io + + def run_compile(type_name, asm): + out = io.BytesIO() + compile_file(type_name, io.StringIO(asm), out) + out.seek(0) + return out + + def run_disassemble(binary): + out = io.StringIO() + disassemble_file(binary, out) + out.seek(0) + return out + + # compile -> disassemble -> compile round-trip: binary is identical. + asm = "@summary: dup @get_value_as_unsigned call return\n@get_num_children: drop 5u return" + binary1 = run_compile("MyType", asm) + dis = run_disassemble(binary1) + binary2 = run_compile("MyType", dis.read()) + self.assertEqual(binary1.getvalue(), binary2.getvalue()) + + # disassemble -> compile -> disassemble round-trip: text is identical. + dis2 = run_disassemble(binary2) + self.assertEqual(dis.getvalue(), dis2.getvalue()) + + # disassemble output contains expected signatures. + self.assertIn("@summary:", dis.getvalue()) + self.assertIn("@get_num_children:", dis.getvalue()) + + # Duplicate signature is an error. + with self.assertRaises(ValueError): + run_compile("MyType", "@summary: 1u return\n@summary: 2u return") + + unittest.main(argv=[__file__]) _______________________________________________ lldb-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits
