[Lldb-commits] [lldb] [lldb] Add synthetic support to formatter_bytecode.py (PR #183804)

Dave Lee via lldb-commits Fri, 27 Feb 2026 12:00:39 -0800

================
@@ -119,16 +136,89 @@ def define_selector(n, name):
 # Compiler.
 
################################################################################
 
+_SIGNATURE_LABEL = re.compile(f"@(?:{SIGNATURE_NAMES}):$")
+
+
+def _tokenize(assembler: str) -> list[str]:
+    """Convert string of assembly into tokens."""
+    # With one exception, tokens are simply sequences of non-space characters.
+    # The one exception is string literals, which may have spaces.
+
+    # To parse strings, which can contain escaped contents, use a "Friedl
+    # unrolled loop". The high level of such a regex is:
+    #     open normal* ( special normal* )* close
+    # which for string literals is:
+    string_literal = r'" [^"\\]* (?: \\. [^"\\]* )* "'
+
+    return re.findall(rf"{string_literal} | \S+", assembler, re.VERBOSE)
+
+
+def _segment_by_signature(input: list[str]) -> list[Tuple[str, list[str]]]:
+    """Segment the input tokens along signature labels."""
+    segments = []
+
+    # Loop state
+    signature = None
+    tokens = []
+
+    def conclude_segment():
+        if not tokens:
+            raise ValueError(f"empty signature: {signature}")
+        segments.append((signature, tokens))
+
+    for token in input:
+        if _SIGNATURE_LABEL.match(token):
+            if signature:
+                conclude_segment()
+            signature = token[1:-1]  # strip leading @, trailing :
+            tokens = []
+        else:
+            tokens.append(token)
+
+    if signature:
+        conclude_segment()
+
+    return segments
+
+
+def compile_file(type_name: str, input: TextIO, output: BinaryIO) -> None:
+    input_tokens = _tokenize(input.read())
+
+    signatures = {}
+    for sig, tokens in _segment_by_signature(input_tokens):
+        if sig in signatures:
+            raise ValueError(f"duplicate signature: {sig}")
+        signatures[sig] = compile_tokens(tokens)
+
+    # FIXME: review use of ints below, check if any are in fact uleb.
+    bin = bytearray()
+    bin.extend(_to_uleb(len(type_name)))
+    bin.extend(bytes(type_name, encoding="utf-8"))
+    flags = 0
+    bin.extend(_to_byte(flags))
+    for sig, bc in signatures.items():
+        bin.extend(_to_byte(SIGNATURES[sig]))
+        bin.extend(_to_uleb(len(bc)))
+        bin.extend(bc)
+
+    # FIXME: is version a uleb?
+    output.write(_to_byte(BINARY_VERSION))
----------------
kastiglione wrote:


Good point, the ints in this function are all guaranteed to be less than 
0x7f (flags, signature, version).

https://github.com/llvm/llvm-project/pull/183804
_______________________________________________
lldb-commits mailing list
lldb-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

[Lldb-commits] [lldb] [lldb] Add synthetic support to formatter_bytecode.py (PR #183804)

Reply via email to