Change the sub() method to do what is expected, parsing backref arguments like \0, \1, \2, ...
Signed-off-by: Mauro Carvalho Chehab <[email protected]> --- tools/lib/python/kdoc/c_lex.py | 240 +++++++++++++++++++++++++++------ 1 file changed, 202 insertions(+), 38 deletions(-) diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py index e986a4ad73e3..98031cb7907c 100644 --- a/tools/lib/python/kdoc/c_lex.py +++ b/tools/lib/python/kdoc/c_lex.py @@ -10,6 +10,8 @@ Those help caching regular expressions and do matching for kernel-doc. import re +from copy import copy + from .kdoc_re import KernRe class CToken(): @@ -36,6 +38,8 @@ class CToken(): NAME = 14 #: A name. Can be an ID or a type. SPACE = 15 #: Any space characters, including new lines + BACKREF = 16 #: Not a valid C sequence, but used at sub regex patterns. + MISMATCH = 255 #: an error indicator: should never happen in practice. # Dict to convert from an enum interger into a string. @@ -107,6 +111,8 @@ TOKEN_LIST = [ (CToken.SPACE, r"[\s]+"), + (CToken.BACKREF, r"\\\d+"), + (CToken.MISMATCH,r"."), ] @@ -245,6 +251,167 @@ class CTokenizer(): return out +class CTokenArgs: + """ + Ancillary class to help using backrefs from sub matches. + + If the highest backref contain a "+" at the last element, + the logic will be greedy, picking all other delims. + + This is needed to parse struct_group macros with end with ``MEMBERS...``. 
+ """ + def __init__(self, sub_str): + self.sub_groups = set() + self.max_group = -1 + self.greedy = None + + for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str): + group = int(m.group(1)) + if m.group(2) == "+": + if self.greedy and self.greedy != group: + raise ValueError("There are multiple greedy patterns!") + self.greedy = group + + self.sub_groups.add(group) + self.max_group = max(self.max_group, group) + + if self.greedy: + if self.greedy != self.max_group: + raise ValueError("Greedy pattern is not the last one!") + + sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str) + + self.sub_str = sub_str + self.sub_tokeninzer = CTokenizer(sub_str) + + def groups(self, new_tokenizer): + """ + Create replacement arguments for backrefs like: + + ``\0``, ``\1``, ``\2``, ...``\n`` + + It also accepts a ``+`` character to the highest backref. When used, + it means in practice to ignore delimins after it, being greedy. + + The logic is smart enough to only go up to the maximum required + argument, even if there are more. + + If there is a backref for an argument above the limit, it will + raise an exception. Please notice that, on C, square brackets + don't have any separator on it. Trying to use ``\1``..``\n`` for + brackets also raise an exception. + """ + + level = (0, 0, 0) + + if self.max_group < 0: + return level, [] + + tokens = new_tokenizer.tokens + + # + # Fill \0 with the full token contents + # + groups_list = [ [] ] + + if 0 in self.sub_groups: + inner_level = 0 + + for i in range(0, len(tokens)): + tok = tokens[i] + + if tok.kind == CToken.BEGIN: + inner_level += 1 + continue + + if tok.kind == CToken.END: + inner_level -= 1 + if inner_level < 0: + break + + if inner_level: + groups_list[0].append(tok) + + if not self.max_group: + return level, groups_list + + delim = None + + # + # Ignore everything before BEGIN. 
The value of begin gives the + # delimiter to be used for the matches + # + for i in range(0, len(tokens)): + tok = tokens[i] + if tok.kind == CToken.BEGIN: + if tok.value == "{": + delim = ";" + elif tok.value == "(": + delim = "," + else: + raise ValueError(fr"Can't handle \1..\n on {sub_str}") + + level = tok.level + break + + pos = 1 + groups_list.append([]) + + inner_level = 0 + for i in range(i + 1, len(tokens)): + tok = tokens[i] + + if tok.kind == CToken.BEGIN: + inner_level += 1 + if tok.kind == CToken.END: + inner_level -= 1 + if inner_level < 0: + break + + if tok.kind == CToken.PUNC and delim == tok.value: + pos += 1 + if self.greedy and pos > self.max_group: + pos -= 1 + else: + groups_list.append([]) + + if pos > self.max_group: + break + + continue + + groups_list[pos].append(tok) + + if pos < self.max_group: + raise ValueError(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}") + + return level, groups_list + + def tokens(self, new_tokenizer): + level, groups = self.groups(new_tokenizer) + + new = CTokenizer() + + for tok in self.sub_tokeninzer.tokens: + if tok.kind == CToken.BACKREF: + group = int(tok.value[1:]) + + for group_tok in groups[group]: + new_tok = copy(group_tok) + + new_level = [0, 0, 0] + + for i in range(0, len(level)): + new_level[i] = new_tok.level[i] + level[i] + + new_tok.level = tuple(new_level) + + new.tokens += [ new_tok ] + else: + new.tokens += [ tok ] + + return new.tokens + class CMatch: """ Finding nested delimiters is hard with regular expressions. It is @@ -270,31 +437,9 @@ class CMatch: will ignore the search string. """ - # TODO: make CMatch handle multiple match groups - # - # Right now, regular expressions to match it are defined only up to - # the start delimiter, e.g.: - # - # \bSTRUCT_GROUP\( - # - # is similar to: STRUCT_GROUP\((.*)\) - # except that the content inside the match group is delimiter-aligned. 
- # - # The content inside parentheses is converted into a single replace - # group (e.g. r`\0'). - # - # It would be nice to change such definition to support multiple - # match groups, allowing a regex equivalent to: - # - # FOO\((.*), (.*), (.*)\) - # - # it is probably easier to define it not as a regular expression, but - # with some lexical definition like: - # - # FOO(arg1, arg2, arg3) def __init__(self, regex): - self.regex = KernRe(regex) + self.regex = KernRe("^" + regex + r"\b") def _search(self, tokenizer): """ @@ -317,7 +462,6 @@ class CMatch: """ start = None - offset = -1 started = False import sys @@ -339,9 +483,8 @@ class CMatch: if tok.kind == CToken.END and tok.level == stack[-1][1]: start, level = stack.pop() - offset = i - yield CTokenizer(tokenizer.tokens[start:offset + 1]) + yield start, i start = None # @@ -349,9 +492,9 @@ class CMatch: # This is meant to solve cases where the caller logic might be # picking an incomplete block. # - if start and offset < 0: + if start and stack: print("WARNING: can't find an end", file=sys.stderr) - yield CTokenizer(tokenizer.tokens[start:]) + yield start, len(tokenizer.tokens) def search(self, source): """ @@ -368,13 +511,15 @@ class CMatch: tokenizer = CTokenizer(source) is_token = False - for new_tokenizer in self._search(tokenizer): + for start, end in self._search(tokenizer): + new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1]) + if is_token: yield new_tokenizer else: yield str(new_tokenizer) - def sub(self, sub, line, count=0): + def sub(self, sub_str, source, count=0): """ This is similar to re.sub: @@ -398,20 +543,39 @@ class CMatch: is_token = False tokenizer = CTokenizer(source) + # Detect if sub_str contains sub arguments + + args_match = CTokenArgs(sub_str) + new_tokenizer = CTokenizer() - cur_pos = 0 + pos = 0 + n = 0 + + # + # NOTE: the code below doesn't consider overlays at sub. + # We may need to add some extra unit tests to check if those + # would cause problems. 
When replacing by "", this should not + # be a problem, but other transformations could be problematic + # for start, end in self._search(tokenizer): - new_tokenizer.tokens += tokenizer.tokens[cur_pos:start] -# new_tokenizer.tokens += [sub_str] + new_tokenizer.tokens += tokenizer.tokens[pos:start] - cur_pos = end + 1 + new = CTokenizer(tokenizer.tokens[start:end + 1]) - if cur_pos: - new_tokenizer.tokens += tokenizer.tokens[cur_pos:] + new_tokenizer.tokens += args_match.tokens(new) - print(new_tokenizer.tokens) + pos = end + 1 - return str(new_tokenizer) + n += 1 + if count and n >= count: + break + + new_tokenizer.tokens += tokenizer.tokens[pos:] + + if not is_token: + return str(new_tokenizer) + + return new_tokenizer def __repr__(self): """ -- 2.52.0

