[llvm-branch-commits] [llvm] [Dexter] Add basic result evaluation for structured scripts (PR #198803)

Stephen Tozer via llvm-branch-commits Mon, 25 May 2026 15:35:24 -0700

https://github.com/SLTozer updated 
https://github.com/llvm/llvm-project/pull/198803


>From 35329614671f39c5234effdf2fd9564a3cfdce30 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <[email protected]>
Date: Wed, 13 May 2026 17:16:02 +0100
Subject: [PATCH 1/4] [Dexter] Add basic result evaluation for structured
 scripts

This patch adds evaluation for structured scripts, completing the features
required to run simple Dexter tests using structured scripts. The basic
output from these evaluations is a list of named metrics aggregating the
results of evaluating !value nodes. The verbose output gives a per-step
summary of the results for each expect node active at that step.

Most of the new functionality is in the evaluation/ dir, which has also
absorbed some functionality previously stored in the
ScriptDebuggerController for matching !where nodes to a debugger StepIR,
as this is logic which is common to both managing a debugger session and
evaluating the end result.
---
 .../ScriptDebuggerController.py               |  85 +------------
 .../dexter/dex/evaluation/ExpectMatch.py      |  36 ++++++
 .../dexter/dex/evaluation/Metrics.py          | 120 ++++++++++++++++++
 .../dexter/dex/evaluation/RunMatch.py         | 120 ++++++++++++++++++
 .../dexter/dex/evaluation/StateMatch.py       |  92 ++++++++++++++
 .../dexter/dex/evaluation/__init__.py         |  10 ++
 .../dexter/dex/test_script/__init__.py        |  10 ++
 .../dexter/dex/tools/test/Tool.py             |  63 ++++++---
 .../scripts/evaluation/basic_evaluate.cpp     |  42 ++++++
 .../scripts/evaluation/evaluate_nothing.cpp   |  17 +++
 10 files changed, 496 insertions(+), 99 deletions(-)
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
 create mode 100644 
cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp

diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py
 
b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py
index c8d5dff4853ca..a130d7ceadedf 100644
--- 
a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py
+++ 
b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py
@@ -9,98 +9,19 @@
 
 
 from enum import Enum
-import os
 import time
-from typing import Dict, List, Tuple
 
 from dex.debugger.DebuggerControllers.DebuggerControllerBase import (
     DebuggerControllerBase,
 )
 from dex.debugger.DebuggerBase import DebuggerBase
 from dex.debugger.DAP import DAP
-from dex.test_script.Nodes import Expect, Value, Where
+from dex.evaluation.StateMatch import get_active_where_expects
+from dex.test_script.Nodes import Where
 from dex.test_script.Script import DexterScript, Scope
 from dex.tools import Context
 from dex.utils.Timeout import Timeout
-from dex.dextIR import DextIR, FrameIR, StepIR
-
-
-def is_subpath(subpath: str, superpath: str) -> bool:
-    """Returns True if subpath is not a trailing subpath of superpath, i.e. if 
`superpath` does not end with `subpath`
-    after normalizing both paths."""
-    normalized_subpath: str = os.path.normcase(os.path.normpath(subpath))
-    normalized_superpath: str = os.path.normcase(os.path.normpath(superpath))
-    return normalized_superpath.endswith(normalized_subpath)
-
-
-def match_where_to_frame(
-    where: Where,
-    frame: FrameIR,
-) -> bool:
-    """A very simple matcher, returns True iff `where` matches `frame`."""
-    if where.file is not None and not is_subpath(where.file, frame.loc.path):
-        return False
-    if where.function is not None:
-        fn = frame.function
-        if "(" in fn:
-            fn = fn.split("(")[0]
-        if where.function != fn:
-            return False
-    if where.lines is not None:
-        if frame.loc.lineno not in where.get_lines():
-            return False
-    if (
-        where.for_hit_count is not None
-        or where.after_hit_count is not None
-        or where.conditions is not None
-    ):
-        raise NotImplementedError(
-            "!where hit counts and conditions currently unsupported."
-        )
-    return True
-
-
-def get_active_where_expects(
-    script: DexterScript, step_info: StepIR
-) -> Dict[Where, Tuple[int, List[Value]]]:
-    """Match the script against the step_info, producing a dict that maps each 
!where that matches a stack frame to the
-    index of the (outermost) stack frame that it matches, and if the frame 
that it matches is the current stack frame
-    (i.e. the frame index is 0), also includes a list of every direct child 
!expect node for that !where.
-    """
-    active_where_expects: Dict[Where, Tuple[int, List[Value]]] = {}
-
-    def get_active_wheres(where: Where, scope: Scope):
-        if scope.where:
-            raise NotImplementedError(
-                "Support for nested !where nodes currently unimplemented."
-            )
-        # For this !where, search for the lowest stack frame (e.g. the 
outermost call) that matches it.
-        matching_frame_idx = next(
-            (
-                frame_idx
-                for frame_idx, frame in 
reversed(list(enumerate(step_info.frames)))
-                if match_where_to_frame(where, frame)
-            ),
-            None,
-        )
-        if matching_frame_idx is not None:
-            active_where_expects[where] = (matching_frame_idx, [])
-
-    # As we visit the script nodes in pre-order traversal, we can always 
assume that an expect's parent !where
-    # has already been visited, and thus should have an entry in 
active_where_expects if it is active.
-    def get_active_expects(expect: Expect, expected_value, scope: Scope):
-        assert isinstance(
-            expect, Value
-        ), "Values should be the only type of expect possible!"
-        if (
-            scope.where in active_where_expects
-            and active_where_expects[scope.where][0] == 0
-        ):
-            active_where_expects[scope.where][1].append(expect)
-
-    script.visit_script(visit_where=get_active_wheres, 
visit_expect=get_active_expects)
-
-    return active_where_expects
+from dex.dextIR import DextIR, StepIR
 
 
 class DebuggerAction(Enum):
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
new file mode 100644
index 0000000000000..a8f3f584d1e78
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
@@ -0,0 +1,36 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Utilities for matching debugger output to script expected values."""
+
+from typing import Any, Dict, List, Union
+
+from dex.dextIR import ValueIR
+from dex.test_script.Nodes import Expect, Value
+
+
+
+class DebuggerExpectMatch:
+    """Class that represents the match between a particular expected value for 
an Expect node and the actual debugger
+    output corresponding to the watched value for that node."""
+    def __init__(self, expect: Expect, expected, actual: ValueIR):
+        self.expect = expect
+        self.expected = expected
+        self.actual = actual
+        self.actual_result = self.expect.get_variable_result(self.actual)
+        self.match_result = self.expected is not None and str(self.expected) 
== self.actual_result
+
+def get_expect_match(expect: Expect, expected_values, actual: ValueIR):
+    """Given one or more expected values for an Expect node and an actual 
ValueIR, returns a match for the first
+    matching expected value, or for None if there are no matching expected 
values."""
+    if not isinstance(expected_values, list):
+        expected_values = [expected_values]
+    for expected_value in expected_values:
+        expect_match = DebuggerExpectMatch(expect, expected_value, actual)
+        if expect_match.match_result:
+            return expect_match
+    return DebuggerExpectMatch(expect, None, actual)
+
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
new file mode 100644
index 0000000000000..783f30c95b062
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
@@ -0,0 +1,120 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Produce metric results from the results of a comparison of a DexterScript 
and debugger output.
+"""
+
+from typing import Any, Dict, List, Union
+
+from dex.evaluation.ExpectMatch import DebuggerExpectMatch
+from dex.test_script.Nodes import Expect, Value
+
+
+class Metric:
+    def __init__(self, improves_asc = True):
+        self.improves_asc = improves_asc
+
+    def as_scalar(self) -> float:
+        raise NotImplementedError()
+
+    def aggregate(self, other):
+        raise NotImplementedError()
+
+    # Returns 1 if this metric is better than "other", -1 if it is worse, and 0 
if it is the same.
+    def compare(self, other):
+        a = self.as_scalar()
+        b = other.as_scalar()
+        if not self.improves_asc:
+            a, b = b, a
+        if a > b:
+            return 1
+        elif a < b:
+            return -1
+        else:
+            return 0
+
+class ScalarMetric(Metric):
+    def __init__(self, value: Union[int, float], improves_asc = True):
+        self.value = value
+        super().__init__(improves_asc)
+
+    def as_scalar(self) -> float:
+        return float(self.value)
+
+    def aggregate(self, other):
+        return ScalarMetric(self.value + other.value, self.improves_asc)
+
+    def __repr__(self):
+        return f"{self.value}"
+
+class FractionMetric(Metric):
+    def __init__(self, numerator: int, denominator: int, improves_asc = True):
+        self.num = numerator
+        self.dom = denominator
+        super().__init__(improves_asc)
+
+    def as_scalar(self) -> float:
+        return float(self.num) / float(self.dom)
+
+    def as_pct(self) -> float:
+        return self.as_scalar() * 100
+
+    def aggregate(self, other):
+        return FractionMetric(self.num + other.num, self.dom + other.dom, 
self.improves_asc)
+
+    def __repr__(self):
+        return f"{self.as_pct():.1f}% ({self.num}/{self.dom})"
+
+def serialize_metric_to_json(metric):
+    if isinstance(metric, ScalarMetric):
+        return metric.value
+    elif isinstance(metric, FractionMetric):
+        return metric.as_pct()
+    raise Exception("Invalid metric type!")
+
+def get_variable_metrics(expect: Expect, expected_values: Any, matches: 
List[DebuggerExpectMatch]) -> Dict[str, Metric]:
+    """Given an Expect node with its expected values and a list of all matches 
for that Expect in a debugger session,
+    returns the computed metrics for that Expect node."""
+    assert isinstance(expect, Value), "Non-Value expects currently unsupported"
+    if not isinstance(expected_values, list):
+        expected_values = [expected_values]
+    num_total_steps = len(matches)
+    seen_expected_values = set()
+    num_correct_steps = 0
+    num_missing_var_steps = 0
+    num_unexpected_value_steps = 0
+    for match in matches:
+        if match.match_result:
+            seen_expected_values.add(match.expected)
+            num_correct_steps += 1
+        elif match.actual_result is None:
+            num_missing_var_steps += 1
+        else:
+            num_unexpected_value_steps += 1
+    num_seen_values = sum(1 for ev in expected_values if ev in 
seen_expected_values)
+    # Finally, produce the map of metric names to metrics for this expect node.
+    metrics = {
+        # The number of steps. Though this is not a useful metric in itself, 
it may be useful to see in tandem with
+        # other variables.
+        "total_watched_steps": ScalarMetric(num_total_steps),
+        # The number of steps where the expected value sequence was observed.
+        "correct_steps": ScalarMetric(num_correct_steps),
+        # The number of steps which did not match the expected value sequence.
+        "incorrect_steps": ScalarMetric(num_total_steps - num_correct_steps, 
improves_asc=False),
+        # The number of steps where the watched variable/expression was not 
available in the debugger.
+        "missing_var_steps": ScalarMetric(num_missing_var_steps, 
improves_asc=False),
+        # The number of steps where the watched variable/expression had a 
value not in the set of expected values.
+        "unexpected_value_steps": ScalarMetric(
+            num_unexpected_value_steps, improves_asc=False
+        ),
+        # The % of steps where the expected value sequence was observed.
+        "correct_step_coverage": FractionMetric(num_correct_steps, 
num_total_steps),
+        # The number of expected values that were observed at least once.
+        "seen_values": ScalarMetric(num_seen_values),
+        # The number of expected values that were not observed.
+        "missing_values": ScalarMetric(len(expected_values) - num_seen_values, 
improves_asc=False),
+    }
+    return metrics
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
new file mode 100644
index 0000000000000..f24629e8b0500
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
@@ -0,0 +1,120 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Classes for matching observed debugger output to script expectations.
+"""
+
+# For each command, there is a set of metrics that can be generated. Metrics 
across multiple identical commands can be
+# aggregated, and each individual metric can be expressed in a scalar form 
that is considered "better" as it either
+# ascends or descends.
+from collections import defaultdict
+from typing import Any, Dict, List, Tuple
+
+from dex.dextIR import DextIR, StepIR
+from dex.evaluation.ExpectMatch import DebuggerExpectMatch, get_expect_match
+from dex.evaluation.Metrics import Metric, get_variable_metrics, 
serialize_metric_to_json
+from dex.evaluation.StateMatch import get_active_where_expects
+from dex.test_script import DexterScript, Scope
+from dex.test_script.Nodes import Expect, Value
+
+class DebuggerStepMatch:
+    """Class used to record the match between a DexterScript and a StepIR, 
including the state match, determining which
+    script nodes are "active", and the expect matches, which compare the 
debugger's output to the DexterScript's
+    expected output."""
+    def __init__(self, step: StepIR, script: DexterScript):
+        self.step = step
+        self.script = script
+        self.state_match = get_active_where_expects(script, step)
+        expects_to_match = {expect for frame_idx, expects in 
self.state_match.values() for expect in expects}
+        self.expect_matches: Dict[Expect, DebuggerExpectMatch] = {}
+        def add_expected_values(expect: Expect, expected_value: Any, scope: 
Scope):
+            assert isinstance(expect, Value), "Non-Value expects currently 
unsupported"
+            if expect in expects_to_match:
+                self.expect_matches[expect] = get_expect_match(expect, 
expected_value, step.watches[expect.get_watched_expr()])
+        script.visit_script(visit_expect=add_expected_values)
+
+class DebuggerRunMatch(object):
+    """Class used to record the complete match of a debugger session and a 
DexterScript. Compares debugger steps to the
+    script one-at-a-time, rather than comparing individual variables 
longitudinally, as there will exist some shared
+    state across evaluation that is updated step-by-step and can be shared 
across variables."""
+    def __init__(self, context, dext_ir: DextIR):
+        self.context = context
+        self.dext_ir = dext_ir
+        self.metrics: Dict[str, Metric] = {}
+        self.step_matches: List[DebuggerStepMatch] = []
+        self.per_expect_results: Dict[
+            Expect, list[Tuple[int, DebuggerExpectMatch]]
+        ] = {}
+
+        script = self.dext_ir.script
+        assert script is not None, "Trying to evaluate DextIR without attached 
script?"
+
+        # Gather the expected values for each Expect.
+        expected_values = {}
+        def add_expected_values(expect: Expect, expected_value: Any, scope: 
Scope):
+            assert isinstance(expect, Value), "Non-Value expects currently 
unsupported"
+            expected_values[expect] = expected_value
+            self.per_expect_results[expect] = []
+        script.visit_script(visit_expect=add_expected_values)
+
+        # Then produce all of our step matches.
+        for step in self.dext_ir.steps:
+            self.step_matches.append(DebuggerStepMatch(step, script))
+
+        # Then, for each expect, produce the list of results for just that 
variable.
+        for step_match in self.step_matches:
+            for expect, expect_match in step_match.expect_matches.items():
+                
self.per_expect_results[expect].append((step_match.step.step_index, 
expect_match))
+
+        # Finally, compare the match results against the expected values to 
produce the metrics.
+        for expect, expect_results in self.per_expect_results.items():
+            expect_matches = [match for step, match in expect_results]
+            expect_metrics = get_variable_metrics(expect, 
expected_values[expect], expect_matches)
+            for metric_name, metric in expect_metrics.items():
+                if metric_name not in self.metrics:
+                    self.metrics[metric_name] = metric
+                else:
+                    self.metrics[metric_name] = 
self.metrics[metric_name].aggregate(metric)
+
+
+    def dump_step_results(self) -> str:
+        result = ""
+        for step_match in self.step_matches:
+            result += f"Step {step_match.step.step_index}:\n"
+            result += f"  {step_match.step.current_location}\n"
+            frame_active_wheres = defaultdict(list)
+            for where, (frame_idx, expects) in step_match.state_match.items():
+                frame_active_wheres[frame_idx].append(str(where))
+            if not frame_active_wheres:
+                result += f"  No active !where nodes.\n"
+                continue
+            frame_active_wheres_list = sorted([(frame_idx, wheres) for 
frame_idx, wheres in frame_active_wheres.items()], key=lambda entry: entry[0])
+            result += f"  Active !where nodes:\n"
+            for frame_idx, wheres in frame_active_wheres_list:
+                result += f"    Frame {frame_idx}: [{', '.join(wheres)}]\n"
+            if not step_match.expect_matches:
+                continue
+            result += f"  Active !expect nodes:\n"
+            matching_expects = [(expect, match) for expect, match in 
step_match.expect_matches.items() if match.match_result]
+            non_matching_expects = [(expect, match) for expect, match in 
step_match.expect_matches.items() if not match.match_result]
+            if matching_expects:
+                result += f"    Matching nodes:     [{', 
'.join(f'{expect}={match.actual_result}' for expect, match in 
matching_expects)}]\n"
+            if non_matching_expects:
+                result += f"    Non-matching nodes: [{', 
'.join(f'{expect}={match.actual_result}' for expect, match in 
non_matching_expects)}]\n"
+        return result
+
+    def get_metric_output(self):
+        if not self.metrics:
+            return "No expects found."
+        lines = []
+        for metric_type, metric in self.metrics.items():
+            lines.append(f"{metric_type}: {metric}")
+        return "\n".join(lines) + "\n"
+
+    def get_metric_json_output(self):
+        if not self.metrics:
+            return "No expects found."
+        return {metric_type: serialize_metric_to_json(metric) for metric_type, 
metric in self.metrics.items()}
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
new file mode 100644
index 0000000000000..cb532dbc5e198
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
@@ -0,0 +1,92 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Utilities for matching debugger state, such as the call stack, conditions, 
or historical state (e.g. breakpoint
+hitcounts) to descriptions of expected state in a DexterScript."""
+
+import os
+from typing import Dict, List, Tuple
+
+from dex.dextIR import FrameIR, StepIR
+from dex.test_script import DexterScript, Scope
+from dex.test_script.Nodes import Expect, Value, Where
+
+
+def is_subpath(subpath: str, superpath: str) -> bool:
+    """Returns True if subpath is a trailing subpath of superpath, i.e. if 
`superpath` ends with `subpath`
+    after normalizing both paths."""
+    normalized_subpath: str = os.path.normcase(os.path.normpath(subpath))
+    normalized_superpath: str = os.path.normcase(os.path.normpath(superpath))
+    return normalized_superpath.endswith(normalized_subpath)
+
+
+# A very simple matcher, returns True iff `where` matches `frame`.
+def match_where_to_frame(
+    where: Where,
+    frame: FrameIR,
+) -> bool:
+    if where.file is not None and not is_subpath(where.file, frame.loc.path):
+        return False
+    if where.function is not None:
+        fn = frame.function
+        if "(" in fn:
+            fn = fn.split("(")[0]
+        if where.function != fn:
+            return False
+    if where.lines is not None:
+        if frame.loc.lineno not in where.get_lines():
+            return False
+    if (
+        where.for_hit_count is not None
+        or where.after_hit_count is not None
+        or where.conditions is not None
+    ):
+        raise NotImplementedError(
+            "!where hit counts and conditions currently unsupported."
+        )
+    return True
+
+def get_active_where_expects(
+    script: DexterScript, step_info: StepIR
+) -> Dict[Where, Tuple[int, List[Value]]]:
+    """Match the script against the step_info, producing a dict that maps each 
!where that matches a stack frame to the
+    index of the (outermost) stack frame that it matches, and if the frame 
that it matches is the current stack frame
+    (i.e. the frame index is 0), also includes a list of every direct child 
!expect node for that !where.
+    """
+    active_where_expects: Dict[Where, Tuple[int, List[Value]]] = {}
+
+    def get_active_wheres(where: Where, scope: Scope):
+        if scope.where:
+            raise NotImplementedError(
+                "Support for nested !where nodes currently unimplemented."
+            )
+        # For this !where, search for the lowest stack frame (e.g. the 
outermost call) that matches it.
+        matching_frame_idx = next(
+            (
+                frame_idx
+                for frame_idx, frame in 
reversed(list(enumerate(step_info.frames)))
+                if match_where_to_frame(where, frame)
+            ),
+            None,
+        )
+        if matching_frame_idx is not None:
+            active_where_expects[where] = (matching_frame_idx, [])
+
+    # As we visit the script nodes in pre-order traversal, we can always 
assume that an expect's parent !where
+    # has already been visited, and thus should have an entry in 
active_where_expects if it is active.
+    def get_active_expects(expect: Expect, expected_value, scope: Scope):
+        assert isinstance(
+            expect, Value
+        ), "Values should be the only type of expect possible!"
+        if (
+            scope.where in active_where_expects
+            and active_where_expects[scope.where][0] == 0
+        ):
+            active_where_expects[scope.where][1].append(expect)
+
+    script.visit_script(visit_where=get_active_wheres, 
visit_expect=get_active_expects)
+
+    return active_where_expects
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py
new file mode 100644
index 0000000000000..ff326371181ca
--- /dev/null
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py
@@ -0,0 +1,10 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Classes for matching observed debugger output to script expectations.
+"""
+
+from dex.evaluation.RunMatch import DebuggerRunMatch
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
index e69de29bb2d1d..6f57e37096f14 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
@@ -0,0 +1,10 @@
+# DExTer : Debugging Experience Tester
+# ~~~~~~   ~         ~~         ~   ~~
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Data and utility methods for Dexter structured scripts.
+"""
+
+from dex.test_script.Script import DexterScript, Scope
\ No newline at end of file
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
index 11f5588fbe710..f366e1a95b4d6 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
@@ -12,6 +12,8 @@
 import pickle
 import shutil
 import platform
+import json
+from typing import Optional, Union
 
 from dex.command.ParseCommand import get_command_infos
 from dex.debugger.Debuggers import run_debugger_subprocess
@@ -21,6 +23,7 @@
     ScriptDebuggerController,
 )
 from dex.dextIR.DextIR import DextIR
+from dex.evaluation import DebuggerRunMatch
 from dex.heuristic import Heuristic
 from dex.test_script.Script import get_dexter_script
 from dex.tools import TestToolBase
@@ -31,10 +34,11 @@
 
 
 class TestCase(object):
-    def __init__(self, context, name, heuristic, error):
+    def __init__(self, context, name, heuristic: Optional[Heuristic]=None, 
error=None, run_match: Optional[DebuggerRunMatch]=None):
         self.context = context
         self.name = name
         self.heuristic = heuristic
+        self.run_match = run_match
         self.error = error
 
     @property
@@ -75,9 +79,11 @@ def __str__(self):
         else:
             error = ""
 
-        try:
+        if self.heuristic is not None:
             summary = self.heuristic.summary_string
-        except AttributeError:
+        elif self.run_match is not None:
+            summary = "\n" + self.run_match.get_metric_output()
+        else:
             summary = "<r>nan/nan (nan)</>"
         return "{}: {}{}\n{}".format(self.name, summary, error, verbose_error)
 
@@ -171,7 +177,7 @@ def _get_steps(self):
         debugger_controller = run_debugger_subprocess(
             debugger_controller, self.context.working_directory.path
         )
-        steps = debugger_controller.step_collection
+        steps: DextIR = debugger_controller.step_collection
         return steps
 
     def _get_results_basename(self, test_name):
@@ -198,6 +204,11 @@ def _get_results_text_path(self, test_name):
         test_results_path = self._get_results_path(test_name)
         return "{}.txt".format(test_results_path)
 
+    def _get_results_json_path(self, test_name):
+        """Returns path results .json file for test denoted by test_name."""
+        test_results_path = self._get_results_path(test_name)
+        return "{}.json".format(test_results_path)
+
     def _get_results_pickle_path(self, test_name):
         """Returns path results .dextIR file for test denoted by test_name."""
         test_results_path = self._get_results_path(test_name)
@@ -216,7 +227,7 @@ def _record_steps(self, test_name, steps):
             with open(output_dextIR_path, "wb") as fp:
                 pickle.dump(steps, fp, protocol=pickle.HIGHEST_PROTOCOL)
 
-    def _record_score(self, test_name, heuristic):
+    def _record_heuristic(self, test_name, heuristic):
         """Write out the test's heuristic score to the results .txt file
         if a results directory has been specified.
         """
@@ -225,6 +236,15 @@ def _record_score(self, test_name, heuristic):
             with open(output_text_path, "a") as fp:
                 self.context.o.auto(heuristic.verbose_output, 
stream=Stream(fp))
 
+    def _record_metrics(self, test_name, run_match: DebuggerRunMatch):
+        """Write out the test's metrics scores to the results .json file
+        if a results directory has been specified.
+        """
+        if self.context.options.results_directory:
+            output_json_path = self._get_results_json_path(test_name)
+            with open(output_json_path, "w") as fp:
+                json.dump(run_match.get_metric_json_output(), fp)
+
     def _record_test_and_display(self, test_case):
         """Output test case to o stream and record test case internally for
         handling later.
@@ -236,19 +256,29 @@ def _record_failed_test(self, test_name, exception):
         """Instantiate a failed test case with failure exception and
         store internally.
         """
-        test_case = TestCase(self.context, test_name, None, exception)
+        test_case = TestCase(self.context, test_name, error=exception)
         self._record_test_and_display(test_case)
 
-    def _record_successful_test(self, test_name, steps, heuristic):
+    def _record_successful_test_heuristic(self, test_name, steps, heuristic):
         """Instantiate a successful test run, store test for handling later.
         Display verbose output for test case if required.
         """
-        test_case = TestCase(self.context, test_name, heuristic, None)
+        test_case = TestCase(self.context, test_name, heuristic=heuristic)
         self._record_test_and_display(test_case)
         if self.context.options.verbose:
             self.context.o.auto("\n{}\n".format(steps))
             self.context.o.auto(heuristic.verbose_output)
 
+    def _record_successful_test_match(self, test_name, steps, result: 
DebuggerRunMatch):
+        """Instantiate a successful test run, store test for handling later.
+        Display verbose output for test case if required.
+        """
+        test_case = TestCase(self.context, test_name, run_match=result)
+        if self.context.options.verbose:
+            self.context.o.auto(f"\n{steps}\n")
+            self.context.o.auto(f"{result.dump_step_results()}\n")
+        self._record_test_and_display(test_case)
+
     def _run_test(self, test_name):
         """Attempt to run test files specified in options.source_files. Store
         result internally in self._test_cases.
@@ -272,18 +302,17 @@ def _run_test(self, test_name):
                 for step in steps.steps:
                     print("\n".join(step.detailed_print()))
                 return
-            assert (
-                not self.context.options.use_script
-            ), "Evaluation not yet supported with --use-script"
             self._record_steps(test_name, steps)
-            heuristic_score = Heuristic(self.context, steps)
-            self._record_score(test_name, heuristic_score)
+            if self.context.options.use_script:
+                run_match = DebuggerRunMatch(self.context, steps)
+                self._record_metrics(test_name, run_match)
+                self._record_successful_test_match(test_name, steps, run_match)
+            else:
+                heuristic_score = Heuristic(self.context, steps)
+                self._record_heuristic(test_name, heuristic_score)
+                self._record_successful_test_heuristic(test_name, steps, 
heuristic_score)
         except (BuildScriptException, DebuggerException, HeuristicException) 
as e:
             self._record_failed_test(test_name, e)
-            return
-
-        self._record_successful_test(test_name, steps, heuristic_score)
-        return
 
     def _handle_results(self) -> ReturnCode:
         return_code = ReturnCode.OK
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
new file mode 100644
index 0000000000000..e3a203a396560
--- /dev/null
+++ 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
@@ -0,0 +1,42 @@
+// RUN: %dexter_regression_test_cxx_build %s -o %t
+// RUN: %dexter_regression_test_run --use-script --binary %t -- %s | FileCheck 
%s
+
+// Test evaluation of a simple Dexter test.
+
+// CHECK: basic_evaluate.cpp:
+// CHECK: total_watched_steps: 6
+// CHECK: correct_steps: 4
+// CHECK: incorrect_steps: 2
+// CHECK: missing_var_steps: 1
+// CHECK: unexpected_value_steps: 1
+// CHECK: correct_step_coverage: 66.7% (4/6)
+// CHECK: seen_values: 5
+// CHECK: missing_values: 5
+
+int multiply(int b, int a) {
+    int result = a * b;
+    return result;
+}
+
+int main() {
+    int a = 6;
+    int b = 7;
+    int c = multiply(a, b);
+    return c;
+}
+
+/*
+---
+!where {lines: 18}:
+    !value a: 5 # 1 Incorrect, 1 Missing
+    !value b: 6 # 1 Correct + Seen
+    !value result: [40, 42] # 1 Correct + Seen, 1 Incorrect + Missing
+!where {lines: 25}:
+    !value a: [6, 6] # 1 Correct, 2 Seen
+    !value b: 7 # 1 Correct + Seen
+    !value not_real: 42 # 1 Incorrect + Missing
+!where {lines: 100}: # Never entered
+    !value irrelevant: 10 # 1 Missing
+    !value unseen: 'abc' # 1 Missing
+...
+*/
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
new file mode 100644
index 0000000000000..31ff9b4392b81
--- /dev/null
+++ 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
@@ -0,0 +1,17 @@
+// RUN: %dexter_regression_test_cxx_build %s -o %t
+// RUN: %dexter_regression_test_run --use-script --binary %t -- %s | FileCheck 
%s
+
+// Test evaluation of a Dexter test with no expects.
+
+// CHECK: evaluate_nothing.cpp:
+// CHECK: No expects found.
+
+int main() {
+    return 0;
+}
+
+/*
+---
+!where {lines: 10}: {} # No expects
+...
+*/

>From dfc55ff3239600b27941eb7e8ec4a6ebe67b7e4f Mon Sep 17 00:00:00 2001
From: Stephen Tozer <[email protected]>
Date: Wed, 20 May 2026 16:09:53 +0100
Subject: [PATCH 2/4] Minor fixup(s)

---
 .../debuginfo-tests/dexter/dex/test_script/__init__.py          | 2 +-
 .../debuginfo-tests/dexter/dex/tools/test/Tool.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
index 6f57e37096f14..00ec7b66679cb 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py
@@ -7,4 +7,4 @@
 """Data and utility methods for Dexter structured scripts.
 """
 
-from dex.test_script.Script import DexterScript, Scope
\ No newline at end of file
+from dex.test_script.Script import DexterScript, Scope
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
index f366e1a95b4d6..47bd77e3c5c72 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
@@ -13,7 +13,7 @@
 import shutil
 import platform
 import json
-from typing import Optional, Union
+from typing import Optional
 
 from dex.command.ParseCommand import get_command_infos
 from dex.debugger.Debuggers import run_debugger_subprocess

>From b61e114f46bd2e04d63e31a0e0234b87e14ae80e Mon Sep 17 00:00:00 2001
From: Stephen Tozer <[email protected]>
Date: Wed, 20 May 2026 16:13:45 +0100
Subject: [PATCH 3/4] format

---
 .../dexter/dex/evaluation/ExpectMatch.py      |  8 ++-
 .../dexter/dex/evaluation/Metrics.py          | 26 ++++++--
 .../dexter/dex/evaluation/RunMatch.py         | 65 +++++++++++++++----
 .../dexter/dex/evaluation/StateMatch.py       |  1 +
 .../dexter/dex/tools/test/Tool.py             | 13 +++-
 .../scripts/evaluation/basic_evaluate.cpp     | 12 ++--
 .../scripts/evaluation/evaluate_nothing.cpp   |  5 +-
 7 files changed, 98 insertions(+), 32 deletions(-)

diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
index a8f3f584d1e78..7296f2226af8a 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py
@@ -12,16 +12,19 @@
 from dex.test_script.Nodes import Expect, Value
 
 
-
 class DebuggerExpectMatch:
     """Class that represents the match between a particular expected value for 
an Expect node and the actual debugger
     output corresponding to the watched value for that node."""
+
     def __init__(self, expect: Expect, expected, actual: ValueIR):
         self.expect = expect
         self.expected = expected
         self.actual = actual
         self.actual_result = self.expect.get_variable_result(self.actual)
-        self.match_result = self.expected is not None and str(self.expected) 
== self.actual_result
+        self.match_result = (
+            self.expected is not None and str(self.expected) == 
self.actual_result
+        )
+
 
 def get_expect_match(expect: Expect, expected_values, actual: ValueIR):
     """Given one or more expected values for an Expect node and an actual 
ValueIR, returns a match for the first
@@ -33,4 +36,3 @@ def get_expect_match(expect: Expect, expected_values, actual: 
ValueIR):
         if expect_match.match_result:
             return expect_match
     return DebuggerExpectMatch(expect, None, actual)
-
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
index 783f30c95b062..a1cbda2d97065 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
@@ -14,7 +14,7 @@
 
 
 class Metric:
-    def __init__(self, improves_asc = True):
+    def __init__(self, improves_asc=True):
         self.improves_asc = improves_asc
 
     def as_scalar(self) -> float:
@@ -36,8 +36,9 @@ def compare(self, other):
         else:
             return 0
 
+
 class ScalarMetric(Metric):
-    def __init__(self, value: Union[int, float], improves_asc = True):
+    def __init__(self, value: Union[int, float], improves_asc=True):
         self.value = value
         super().__init__(improves_asc)
 
@@ -50,8 +51,9 @@ def aggregate(self, other):
     def __repr__(self):
         return f"{self.value}"
 
+
 class FractionMetric(Metric):
-    def __init__(self, numerator: int, denominator: int, improves_asc = True):
+    def __init__(self, numerator: int, denominator: int, improves_asc=True):
         self.num = numerator
         self.dom = denominator
         super().__init__(improves_asc)
@@ -63,11 +65,14 @@ def as_pct(self) -> float:
         return self.as_scalar() * 100
 
     def aggregate(self, other):
-        return FractionMetric(self.num + other.num, self.dom + other.dom, 
self.improves_asc)
+        return FractionMetric(
+            self.num + other.num, self.dom + other.dom, self.improves_asc
+        )
 
     def __repr__(self):
         return f"{self.as_pct():.1f}% ({self.num}/{self.dom})"
 
+
 def serialize_metric_to_json(metric):
     if isinstance(metric, ScalarMetric):
         return metric.value
@@ -75,7 +80,10 @@ def serialize_metric_to_json(metric):
         return metric.as_pct()
     raise Exception("Invalid metric type!")
 
-def get_variable_metrics(expect: Expect, expected_values: Any, matches: 
List[DebuggerExpectMatch]) -> Dict[str, Metric]:
+
+def get_variable_metrics(
+    expect: Expect, expected_values: Any, matches: List[DebuggerExpectMatch]
+) -> Dict[str, Metric]:
     """Given an Expect node with its expected values and a list of all matches 
for that Expect in a debugger session,
     returns the computed metrics for that Expect node."""
     assert isinstance(expect, Value), "Non-Value expects currently unsupported"
@@ -103,7 +111,9 @@ def get_variable_metrics(expect: Expect, expected_values: 
Any, matches: List[Deb
         # The number of steps where the expected value sequence was observed.
         "correct_steps": ScalarMetric(num_correct_steps),
         # The number of steps which did not match the expected value sequence.
-        "incorrect_steps": ScalarMetric(num_total_steps - num_correct_steps, 
improves_asc=False),
+        "incorrect_steps": ScalarMetric(
+            num_total_steps - num_correct_steps, improves_asc=False
+        ),
         # The number of steps where the watched variable/expression was not 
available in the debugger.
         "missing_var_steps": ScalarMetric(num_missing_var_steps, 
improves_asc=False),
         # The number of steps where the watched variable/expression had a 
value not in the set of expected values.
@@ -115,6 +125,8 @@ def get_variable_metrics(expect: Expect, expected_values: 
Any, matches: List[Deb
         # The number of expected values that were observed at least once.
         "seen_values": ScalarMetric(num_seen_values),
         # The number of expected values that were not observed.
-        "missing_values": ScalarMetric(len(expected_values) - num_seen_values, 
improves_asc=False),
+        "missing_values": ScalarMetric(
+            len(expected_values) - num_seen_values, improves_asc=False
+        ),
     }
     return metrics
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
index f24629e8b0500..40ebcfe387591 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
@@ -15,31 +15,48 @@
 
 from dex.dextIR import DextIR, StepIR
 from dex.evaluation.ExpectMatch import DebuggerExpectMatch, get_expect_match
-from dex.evaluation.Metrics import Metric, get_variable_metrics, 
serialize_metric_to_json
+from dex.evaluation.Metrics import (
+    Metric,
+    get_variable_metrics,
+    serialize_metric_to_json,
+)
 from dex.evaluation.StateMatch import get_active_where_expects
 from dex.test_script import DexterScript, Scope
 from dex.test_script.Nodes import Expect, Value
 
+
 class DebuggerStepMatch:
     """Class used to record the match between a DexterScript and a StepIR, 
including the state match, determining which
     script nodes are "active", and the expect matches, which compare the 
debugger's output to the DexterScript's
     expected output."""
+
     def __init__(self, step: StepIR, script: DexterScript):
         self.step = step
         self.script = script
         self.state_match = get_active_where_expects(script, step)
-        expects_to_match = {expect for frame_idx, expects in 
self.state_match.values() for expect in expects}
+        expects_to_match = {
+            expect
+            for frame_idx, expects in self.state_match.values()
+            for expect in expects
+        }
         self.expect_matches: Dict[Expect, DebuggerExpectMatch] = {}
+
         def add_expected_values(expect: Expect, expected_value: Any, scope: 
Scope):
             assert isinstance(expect, Value), "Non-Value expects currently 
unsupported"
             if expect in expects_to_match:
-                self.expect_matches[expect] = get_expect_match(expect, 
expected_value, step.watches[expect.get_watched_expr()])
+                self.expect_matches[expect] = get_expect_match(
+                    expect, expected_value, 
step.watches[expect.get_watched_expr()]
+                )
+
         script.visit_script(visit_expect=add_expected_values)
 
+
 class DebuggerRunMatch(object):
     """Class used to record the complete match of a debugger session and a 
DexterScript. Compares debugger steps to the
     script one-at-a-time, rather than comparing individual variables 
longtitudinally, as there will exist some shared
-    state across evaluation that is updated step-by-step and can be shared 
across variables."""
+    state across evaluation that is updated step-by-step and can be shared 
across variables.
+    """
+
     def __init__(self, context, dext_ir: DextIR):
         self.context = context
         self.dext_ir = dext_ir
@@ -54,10 +71,12 @@ def __init__(self, context, dext_ir: DextIR):
 
         # Gather the expected values for each Expect.
         expected_values = {}
+
         def add_expected_values(expect: Expect, expected_value: Any, scope: 
Scope):
             assert isinstance(expect, Value), "Non-Value expects currently 
unsupported"
             expected_values[expect] = expected_value
             self.per_expect_results[expect] = []
+
         script.visit_script(visit_expect=add_expected_values)
 
         # Then produce all of our step matches.
@@ -67,18 +86,23 @@ def add_expected_values(expect: Expect, expected_value: 
Any, scope: Scope):
         # Then, for each expect, produce the list of results for just that 
variable.
         for step_match in self.step_matches:
             for expect, expect_match in step_match.expect_matches.items():
-                
self.per_expect_results[expect].append((step_match.step.step_index, 
expect_match))
+                self.per_expect_results[expect].append(
+                    (step_match.step.step_index, expect_match)
+                )
 
         # Finally, compare the match results against the expected values to 
produce the metrics.
         for expect, expect_results in self.per_expect_results.items():
             expect_matches = [match for step, match in expect_results]
-            expect_metrics = get_variable_metrics(expect, 
expected_values[expect], expect_matches)
+            expect_metrics = get_variable_metrics(
+                expect, expected_values[expect], expect_matches
+            )
             for metric_name, metric in expect_metrics.items():
                 if metric_name not in self.metrics:
                     self.metrics[metric_name] = metric
                 else:
-                    self.metrics[metric_name] = 
self.metrics[metric_name].aggregate(metric)
-
+                    self.metrics[metric_name] = 
self.metrics[metric_name].aggregate(
+                        metric
+                    )
 
     def dump_step_results(self) -> str:
         result = ""
@@ -91,15 +115,29 @@ def dump_step_results(self) -> str:
             if not frame_active_wheres:
                 result += f"  No active !where nodes.\n"
                 continue
-            frame_active_wheres_list = sorted([(frame_idx, wheres) for 
frame_idx, wheres in frame_active_wheres.items()], key=lambda entry: entry[0])
+            frame_active_wheres_list = sorted(
+                [
+                    (frame_idx, wheres)
+                    for frame_idx, wheres in frame_active_wheres.items()
+                ],
+                key=lambda entry: entry[0],
+            )
             result += f"  Active !where nodes:\n"
             for frame_idx, wheres in frame_active_wheres_list:
                 result += f"    Frame {frame_idx}: [{', '.join(wheres)}]\n"
             if not step_match.expect_matches:
                 continue
             result += f"  Active !expect nodes:\n"
-            matching_expects = [(expect, match) for expect, match in 
step_match.expect_matches.items() if match.match_result]
-            non_matching_expects = [(expect, match) for expect, match in 
step_match.expect_matches.items() if not match.match_result]
+            matching_expects = [
+                (expect, match)
+                for expect, match in step_match.expect_matches.items()
+                if match.match_result
+            ]
+            non_matching_expects = [
+                (expect, match)
+                for expect, match in step_match.expect_matches.items()
+                if not match.match_result
+            ]
             if matching_expects:
                 result += f"    Matching nodes:     [{', 
'.join(f'{expect}={match.actual_result}' for expect, match in 
matching_expects)}]\n"
             if non_matching_expects:
@@ -117,4 +155,7 @@ def get_metric_output(self):
     def get_metric_json_output(self):
         if not self.metrics:
             return "No expects found."
-        return {metric_type: serialize_metric_to_json(metric) for metric_type, 
metric in self.metrics.items()}
+        return {
+            metric_type: serialize_metric_to_json(metric)
+            for metric_type, metric in self.metrics.items()
+        }
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
index cb532dbc5e198..72d425ad29b75 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
@@ -49,6 +49,7 @@ def match_where_to_frame(
         )
     return True
 
+
 def get_active_where_expects(
     script: DexterScript, step_info: StepIR
 ) -> Dict[Where, Tuple[int, List[Value]]]:
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
index 47bd77e3c5c72..cc0adcf949217 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
@@ -34,7 +34,14 @@
 
 
 class TestCase(object):
-    def __init__(self, context, name, heuristic: Optional[Heuristic]=None, 
error=None, run_match: Optional[DebuggerRunMatch]=None):
+    def __init__(
+        self,
+        context,
+        name,
+        heuristic: Optional[Heuristic] = None,
+        error=None,
+        run_match: Optional[DebuggerRunMatch] = None,
+    ):
         self.context = context
         self.name = name
         self.heuristic = heuristic
@@ -310,7 +317,9 @@ def _run_test(self, test_name):
             else:
                 heuristic_score = Heuristic(self.context, steps)
                 self._record_heuristic(test_name, heuristic_score)
-                self._record_successful_test_heuristic(test_name, steps, 
heuristic_score)
+                self._record_successful_test_heuristic(
+                    test_name, steps, heuristic_score
+                )
         except (BuildScriptException, DebuggerException, HeuristicException) 
as e:
             self._record_failed_test(test_name, e)
 
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
index e3a203a396560..113044a6510a8 100644
--- 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
+++ 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp
@@ -14,15 +14,15 @@
 // CHECK: missing_values: 5
 
 int multiply(int b, int a) {
-    int result = a * b;
-    return result;
+  int result = a * b;
+  return result;
 }
 
 int main() {
-    int a = 6;
-    int b = 7;
-    int c = multiply(a, b);
-    return c;
+  int a = 6;
+  int b = 7;
+  int c = multiply(a, b);
+  return c;
 }
 
 /*
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
index 31ff9b4392b81..40456442ebad8 100644
--- 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
+++ 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp
@@ -7,11 +7,12 @@
 // CHECK: No expects found.
 
 int main() {
-    return 0;
+  // A comment.
+  return 0;
 }
 
 /*
 ---
-!where {lines: 10}: {} # No expects
+!where {lines: 11}: {} # No expects
 ...
 */

>From 89fd5455d3811fa67a1491aaed59710eb5ea9176 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <[email protected]>
Date: Fri, 22 May 2026 17:30:29 +0100
Subject: [PATCH 4/4] Address review comments

---
 .../debuginfo-tests/dexter/dex/evaluation/Metrics.py   | 10 +++++++++-
 .../debuginfo-tests/dexter/dex/evaluation/RunMatch.py  |  7 ++++---
 .../dexter/dex/evaluation/StateMatch.py                |  4 ++--
 .../debuginfo-tests/dexter/dex/tools/test/Tool.py      | 10 ++++++----
 4 files changed, 21 insertions(+), 10 deletions(-)

diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
index a1cbda2d97065..551eb153c0710 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py
@@ -46,6 +46,9 @@ def as_scalar(self) -> float:
         return float(self.value)
 
     def aggregate(self, other):
+        assert (
+            self.improves_asc == other.improves_asc
+        ), "Trying to aggregate different metrics?"
         return ScalarMetric(self.value + other.value, self.improves_asc)
 
     def __repr__(self):
@@ -59,12 +62,17 @@ def __init__(self, numerator: int, denominator: int, 
improves_asc=True):
         super().__init__(improves_asc)
 
     def as_scalar(self) -> float:
+        if self.dom == 0:
+            return float("nan")
         return float(self.num) / float(self.dom)
 
     def as_pct(self) -> float:
         return self.as_scalar() * 100
 
     def aggregate(self, other):
+        assert (
+            self.improves_asc == other.improves_asc
+        ), "Trying to aggregate different metrics?"
         return FractionMetric(
             self.num + other.num, self.dom + other.dom, self.improves_asc
         )
@@ -102,7 +110,7 @@ def get_variable_metrics(
             num_missing_var_steps += 1
         else:
             num_unexpected_value_steps += 1
-    num_seen_values = sum(1 for ev in expected_values if ev in 
seen_expected_values)
+    num_seen_values = sum(ev in seen_expected_values for ev in expected_values)
     # And finally produce the metrics map and add the new result to the list.
     metrics = {
         # The number of steps. Though this is not a useful metric in itself, 
it may be useful to see in tandem with
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
index 40ebcfe387591..adb710dddacde 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py
@@ -52,9 +52,10 @@ def add_expected_values(expect: Expect, expected_value: Any, 
scope: Scope):
 
 
 class DebuggerRunMatch(object):
-    """Class used to record the complete match of a debugger session and a 
DexterScript. Compares debugger steps to the
-    script one-at-a-time, rather than comparing individual variables 
longtitudinally, as there will exist some shared
-    state across evaluation that is updated step-by-step and can be shared 
across variables.
+    """Class used to record the complete match of a debugger session and a 
DexterScript. It is necessary to match
+    step-by-step rather than variable-by-variable (i.e. we evaluate all 
variables for a step before evaluating the
+    next step), because there are features (yet to be implemented) which allow 
the match of one variable at step N to
+    affect the match of another variable at step N+1, thus we go one step at a 
time.
     """
 
     def __init__(self, context, dext_ir: DextIR):
diff --git 
a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
index 72d425ad29b75..6008e47ca280e 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py
@@ -16,8 +16,8 @@
 
 
 def is_subpath(subpath: str, superpath: str) -> bool:
-    """Returns True if subpath is not a trailing subpath of superpath, i.e. if 
`superpath` does not end with `subpath`
-    after normalizing both paths."""
+    """Returns True if subpath is a trailing subpath of superpath, i.e. if 
`superpath` ends with `subpath` after
+    normalizing both paths."""
     normalized_subpath: str = os.path.normcase(os.path.normpath(subpath))
     normalized_superpath: str = os.path.normcase(os.path.normpath(superpath))
     return normalized_superpath.endswith(normalized_subpath)
diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py 
b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
index cc0adcf949217..0c028773ec56c 100644
--- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
+++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py
@@ -234,7 +234,7 @@ def _record_steps(self, test_name, steps):
             with open(output_dextIR_path, "wb") as fp:
                 pickle.dump(steps, fp, protocol=pickle.HIGHEST_PROTOCOL)
 
-    def _record_heuristic(self, test_name, heuristic):
+    def _record_dex_command_heuristic_score(self, test_name, heuristic):
         """Write out the test's heuristic score to the results .txt file
         if a results directory has been specified.
         """
@@ -243,7 +243,9 @@ def _record_heuristic(self, test_name, heuristic):
             with open(output_text_path, "a") as fp:
                 self.context.o.auto(heuristic.verbose_output, 
stream=Stream(fp))
 
-    def _record_metrics(self, test_name, run_match: DebuggerRunMatch):
+    def _record_structured_script_metric_results(
+        self, test_name, run_match: DebuggerRunMatch
+    ):
         """Write out the test's metrics scores to the results .txt file
         if a results directory has been specified.
         """
@@ -312,11 +314,11 @@ def _run_test(self, test_name):
             self._record_steps(test_name, steps)
             if self.context.options.use_script:
                 run_match = DebuggerRunMatch(self.context, steps)
-                self._record_metrics(test_name, run_match)
+                self._record_structured_script_metric_results(test_name, 
run_match)
                 self._record_successful_test_match(test_name, steps, run_match)
             else:
                 heuristic_score = Heuristic(self.context, steps)
-                self._record_heuristic(test_name, heuristic_score)
+                self._record_dex_command_heuristic_score(test_name, 
heuristic_score)
                 self._record_successful_test_heuristic(
                     test_name, steps, heuristic_score
                 )

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [llvm] [Dexter] Add basic result evaluation for structured scripts (PR #198803)

Reply via email to