https://github.com/SLTozer updated https://github.com/llvm/llvm-project/pull/198803
>From 35329614671f39c5234effdf2fd9564a3cfdce30 Mon Sep 17 00:00:00 2001 From: Stephen Tozer <[email protected]> Date: Wed, 13 May 2026 17:16:02 +0100 Subject: [PATCH 1/4] [Dexter] Add basic result evaluation for structured scripts This patch adds evaluation for structured scripts, completing the features required to run simple Dexter tests using structured scripts. The basic output from these evaluations is a list of named metrics aggregating the results of evaluating !value nodes. The verbose output gives a per-step summary of the results for each expect node active at that step. Most of the new functionality is in the evaluation/ dir, which has also absorbed some functionality previously stored in the ScriptDebuggerController for matching !where nodes to a debugger StepIR, as this is logic which is common to both managing a debugger session and evaluating the end result. --- .../ScriptDebuggerController.py | 85 +------------ .../dexter/dex/evaluation/ExpectMatch.py | 36 ++++++ .../dexter/dex/evaluation/Metrics.py | 120 ++++++++++++++++++ .../dexter/dex/evaluation/RunMatch.py | 120 ++++++++++++++++++ .../dexter/dex/evaluation/StateMatch.py | 92 ++++++++++++++ .../dexter/dex/evaluation/__init__.py | 10 ++ .../dexter/dex/test_script/__init__.py | 10 ++ .../dexter/dex/tools/test/Tool.py | 63 ++++++--- .../scripts/evaluation/basic_evaluate.cpp | 42 ++++++ .../scripts/evaluation/evaluate_nothing.cpp | 17 +++ 10 files changed, 496 insertions(+), 99 deletions(-) create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py create mode 100644 cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py create mode 100644 
cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp create mode 100644 cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py index c8d5dff4853ca..a130d7ceadedf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/DebuggerControllers/ScriptDebuggerController.py @@ -9,98 +9,19 @@ from enum import Enum -import os import time -from typing import Dict, List, Tuple from dex.debugger.DebuggerControllers.DebuggerControllerBase import ( DebuggerControllerBase, ) from dex.debugger.DebuggerBase import DebuggerBase from dex.debugger.DAP import DAP -from dex.test_script.Nodes import Expect, Value, Where +from dex.evaluation.StateMatch import get_active_where_expects +from dex.test_script.Nodes import Where from dex.test_script.Script import DexterScript, Scope from dex.tools import Context from dex.utils.Timeout import Timeout -from dex.dextIR import DextIR, FrameIR, StepIR - - -def is_subpath(subpath: str, superpath: str) -> bool: - """Returns True if subpath is not a trailing subpath of superpath, i.e. 
if `superpath` does not end with `subpath` - after normalizing both paths.""" - normalized_subpath: str = os.path.normcase(os.path.normpath(subpath)) - normalized_superpath: str = os.path.normcase(os.path.normpath(superpath)) - return normalized_superpath.endswith(normalized_subpath) - - -def match_where_to_frame( - where: Where, - frame: FrameIR, -) -> bool: - """A very simple matcher, returns True iff `where` matches `frame`.""" - if where.file is not None and not is_subpath(where.file, frame.loc.path): - return False - if where.function is not None: - fn = frame.function - if "(" in fn: - fn = fn.split("(")[0] - if where.function != fn: - return False - if where.lines is not None: - if frame.loc.lineno not in where.get_lines(): - return False - if ( - where.for_hit_count is not None - or where.after_hit_count is not None - or where.conditions is not None - ): - raise NotImplementedError( - "!where hit counts and conditions currently unsupported." - ) - return True - - -def get_active_where_expects( - script: DexterScript, step_info: StepIR -) -> Dict[Where, Tuple[int, List[Value]]]: - """Match the script against the step_info, producing a dict that maps each !where that matches a stack frame to the - index of the (outermost) stack frame that it matches, and if the frame that it matches is the current stack frame - (i.e. the frame index is 0), also includes a list of every direct child !expect node for that !where. - """ - active_where_expects: Dict[Where, Tuple[int, List[Value]]] = {} - - def get_active_wheres(where: Where, scope: Scope): - if scope.where: - raise NotImplementedError( - "Support for nested !where nodes currently unimplemented." - ) - # For this !where, search for the lowest stack frame (e.g. the outermost call) that matches it. 
- matching_frame_idx = next( - ( - frame_idx - for frame_idx, frame in reversed(list(enumerate(step_info.frames))) - if match_where_to_frame(where, frame) - ), - None, - ) - if matching_frame_idx is not None: - active_where_expects[where] = (matching_frame_idx, []) - - # As we visit the script nodes in pre-order traversal, we can always assume that an expect's parent !where - # has already been visited, and thus should have an entry in active_where_expects if it is active. - def get_active_expects(expect: Expect, expected_value, scope: Scope): - assert isinstance( - expect, Value - ), "Values should be the only type of expect possible!" - if ( - scope.where in active_where_expects - and active_where_expects[scope.where][0] == 0 - ): - active_where_expects[scope.where][1].append(expect) - - script.visit_script(visit_where=get_active_wheres, visit_expect=get_active_expects) - - return active_where_expects +from dex.dextIR import DextIR, StepIR class DebuggerAction(Enum): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py new file mode 100644 index 0000000000000..a8f3f584d1e78 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py @@ -0,0 +1,36 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Utilities for matching debugger output to script expected values.""" + +from typing import Any, Dict, List, Union + +from dex.dextIR import ValueIR +from dex.test_script.Nodes import Expect, Value + + + +class DebuggerExpectMatch: + """Class that represents the match between a particular expected value for an Expect node and the actual debugger + output corresponding to the watched value for that node.""" + def __init__(self, expect: Expect, expected, actual: ValueIR): + self.expect = expect + self.expected = expected + self.actual = actual + self.actual_result = self.expect.get_variable_result(self.actual) + self.match_result = self.expected is not None and str(self.expected) == self.actual_result + +def get_expect_match(expect: Expect, expected_values, actual: ValueIR): + """Given one or more expected values for an Expect node and an actual ValueIR, returns a match for the first + matching expected values, or for None if there are no matching expected values.""" + if not isinstance(expected_values, list): + expected_values = [expected_values] + for expected_value in expected_values: + expect_match = DebuggerExpectMatch(expect, expected_value, actual) + if expect_match.match_result: + return expect_match + return DebuggerExpectMatch(expect, None, actual) + diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py new file mode 100644 index 0000000000000..783f30c95b062 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py @@ -0,0 +1,120 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Produce metric results from the results of a comparison of a DexterScript and debugger output. +""" + +from typing import Any, Dict, List, Union + +from dex.evaluation.ExpectMatch import DebuggerExpectMatch +from dex.test_script.Nodes import Expect, Value + + +class Metric: + def __init__(self, improves_asc = True): + self.improves_asc = improves_asc + + def as_scalar(self) -> float: + raise NotImplementedError() + + def aggregate(self, other): + raise NotImplementedError() + + # Returns 1 if this metric is better than "other", -1 if it is worse, and 0 if it is the same. + def compare(self, other): + a = self.as_scalar() + b = other.as_scalar() + if not self.improves_asc: + a, b = b, a + if a > b: + return 1 + elif a < b: + return -1 + else: + return 0 + +class ScalarMetric(Metric): + def __init__(self, value: Union[int, float], improves_asc = True): + self.value = value + super().__init__(improves_asc) + + def as_scalar(self) -> float: + return float(self.value) + + def aggregate(self, other): + return ScalarMetric(self.value + other.value, self.improves_asc) + + def __repr__(self): + return f"{self.value}" + +class FractionMetric(Metric): + def __init__(self, numerator: int, denominator: int, improves_asc = True): + self.num = numerator + self.dom = denominator + super().__init__(improves_asc) + + def as_scalar(self) -> float: + return float(self.num) / float(self.dom) + + def as_pct(self) -> float: + return self.as_scalar() * 100 + + def aggregate(self, other): + return FractionMetric(self.num + other.num, self.dom + other.dom, self.improves_asc) + + def __repr__(self): + return f"{self.as_pct():.1f}% ({self.num}/{self.dom})" + +def serialize_metric_to_json(metric): + if isinstance(metric, ScalarMetric): + return metric.value + elif isinstance(metric, FractionMetric): + return metric.as_pct() + raise Exception("Invalid metric type!") + +def get_variable_metrics(expect: Expect, expected_values: Any, 
matches: List[DebuggerExpectMatch]) -> Dict[str, Metric]: + """Given an Expect node with its expected values and a list of all matches for that Expect in a debugger session, + returns the computed metrics for that Expect node.""" + assert isinstance(expect, Value), "Non-Value expects currently unsupported" + if not isinstance(expected_values, list): + expected_values = [expected_values] + num_total_steps = len(matches) + seen_expected_values = set() + num_correct_steps = 0 + num_missing_var_steps = 0 + num_unexpected_value_steps = 0 + for match in matches: + if match.match_result: + seen_expected_values.add(match.expected) + num_correct_steps += 1 + elif match.actual_result is None: + num_missing_var_steps += 1 + else: + num_unexpected_value_steps += 1 + num_seen_values = sum(1 for ev in expected_values if ev in seen_expected_values) + # And finally produce the metrics map and add the new result to the list. + metrics = { + # The number of steps. Though this is not a useful metric in itself, it may be useful to see in tandem with + # other variables. + "total_watched_steps": ScalarMetric(num_total_steps), + # The number of steps where the expected value sequence was observed. + "correct_steps": ScalarMetric(num_correct_steps), + # The number of steps which did not match the expected value sequence. + "incorrect_steps": ScalarMetric(num_total_steps - num_correct_steps, improves_asc=False), + # The number of steps where the watched variable/expression was not available in the debugger. + "missing_var_steps": ScalarMetric(num_missing_var_steps, improves_asc=False), + # The number of steps where the watched variable/expression had a value not in the set of expected values. + "unexpected_value_steps": ScalarMetric( + num_unexpected_value_steps, improves_asc=False + ), + # The % of steps where the expected value sequence was observed. 
+ "correct_step_coverage": FractionMetric(num_correct_steps, num_total_steps), + # The number of expected values that were observed at least once. + "seen_values": ScalarMetric(num_seen_values), + # The number of expected values that were not observed. + "missing_values": ScalarMetric(len(expected_values) - num_seen_values, improves_asc=False), + } + return metrics diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py new file mode 100644 index 0000000000000..f24629e8b0500 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py @@ -0,0 +1,120 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Classes for matching observed debugger output to script expectations. +""" + +# For each command, there is a set of metrics that can be generated. Metrics across multiple identical commands can be +# aggregated, and each individual metric can be expressed in a scalar form that is considered "better" as it either +# ascends or descends. 
+from collections import defaultdict +from typing import Any, Dict, List, Tuple + +from dex.dextIR import DextIR, StepIR +from dex.evaluation.ExpectMatch import DebuggerExpectMatch, get_expect_match +from dex.evaluation.Metrics import Metric, get_variable_metrics, serialize_metric_to_json +from dex.evaluation.StateMatch import get_active_where_expects +from dex.test_script import DexterScript, Scope +from dex.test_script.Nodes import Expect, Value + +class DebuggerStepMatch: + """Class used to record the match between a DexterScript and a StepIR, including the state match, determining which + script nodes are "active", and the expect matches, which compare the debugger's output to the DexterScript's + expected output.""" + def __init__(self, step: StepIR, script: DexterScript): + self.step = step + self.script = script + self.state_match = get_active_where_expects(script, step) + expects_to_match = {expect for frame_idx, expects in self.state_match.values() for expect in expects} + self.expect_matches: Dict[Expect, DebuggerExpectMatch] = {} + def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): + assert isinstance(expect, Value), "Non-Value expects currently unsupported" + if expect in expects_to_match: + self.expect_matches[expect] = get_expect_match(expect, expected_value, step.watches[expect.get_watched_expr()]) + script.visit_script(visit_expect=add_expected_values) + +class DebuggerRunMatch(object): + """Class used to record the complete match of a debugger session and a DexterScript. 
Compares debugger steps to the + script one-at-a-time, rather than comparing individual variables longitudinally, as there will exist some shared + state across evaluation that is updated step-by-step and can be shared across variables.""" + def __init__(self, context, dext_ir: DextIR): + self.context = context + self.dext_ir = dext_ir + self.metrics: Dict[str, Metric] = {} + self.step_matches: List[DebuggerStepMatch] = [] + self.per_expect_results: Dict[ + Expect, list[Tuple[int, DebuggerExpectMatch]] + ] = {} + + script = self.dext_ir.script + assert script is not None, "Trying to evaluate DextIR without attached script?" + + # Gather the expected values for each Expect. + expected_values = {} + def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): + assert isinstance(expect, Value), "Non-Value expects currently unsupported" + expected_values[expect] = expected_value + self.per_expect_results[expect] = [] + script.visit_script(visit_expect=add_expected_values) + + # Then produce all of our step matches. + for step in self.dext_ir.steps: + self.step_matches.append(DebuggerStepMatch(step, script)) + + # Then, for each expect, produce the list of results for just that variable. + for step_match in self.step_matches: + for expect, expect_match in step_match.expect_matches.items(): + self.per_expect_results[expect].append((step_match.step.step_index, expect_match)) + + # Finally, compare the match results against the expected values to produce the metrics. 
+ for expect, expect_results in self.per_expect_results.items(): + expect_matches = [match for step, match in expect_results] + expect_metrics = get_variable_metrics(expect, expected_values[expect], expect_matches) + for metric_name, metric in expect_metrics.items(): + if metric_name not in self.metrics: + self.metrics[metric_name] = metric + else: + self.metrics[metric_name] = self.metrics[metric_name].aggregate(metric) + + + def dump_step_results(self) -> str: + result = "" + for step_match in self.step_matches: + result += f"Step {step_match.step.step_index}:\n" + result += f" {step_match.step.current_location}\n" + frame_active_wheres = defaultdict(list) + for where, (frame_idx, expects) in step_match.state_match.items(): + frame_active_wheres[frame_idx].append(str(where)) + if not frame_active_wheres: + result += f" No active !where nodes.\n" + continue + frame_active_wheres_list = sorted([(frame_idx, wheres) for frame_idx, wheres in frame_active_wheres.items()], key=lambda entry: entry[0]) + result += f" Active !where nodes:\n" + for frame_idx, wheres in frame_active_wheres_list: + result += f" Frame {frame_idx}: [{', '.join(wheres)}]\n" + if not step_match.expect_matches: + continue + result += f" Active !expect nodes:\n" + matching_expects = [(expect, match) for expect, match in step_match.expect_matches.items() if match.match_result] + non_matching_expects = [(expect, match) for expect, match in step_match.expect_matches.items() if not match.match_result] + if matching_expects: + result += f" Matching nodes: [{', '.join(f'{expect}={match.actual_result}' for expect, match in matching_expects)}]\n" + if non_matching_expects: + result += f" Non-matching nodes: [{', '.join(f'{expect}={match.actual_result}' for expect, match in non_matching_expects)}]\n" + return result + + def get_metric_output(self): + if not self.metrics: + return "No expects found." 
+ lines = [] + for metric_type, metric in self.metrics.items(): + lines.append(f"{metric_type}: {metric}") + return "\n".join(lines) + "\n" + + def get_metric_json_output(self): + if not self.metrics: + return "No expects found." + return {metric_type: serialize_metric_to_json(metric) for metric_type, metric in self.metrics.items()} diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py new file mode 100644 index 0000000000000..cb532dbc5e198 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py @@ -0,0 +1,92 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Utilities for matching debugger state, such as the call stack, conditions, or historical state (e.g. breakpoint +hitcounts) to descriptions of expected state in a DexterScript.""" + +import os +from typing import Dict, List, Tuple + +from dex.dextIR import FrameIR, StepIR +from dex.test_script import DexterScript, Scope +from dex.test_script.Nodes import Expect, Value, Where + + +def is_subpath(subpath: str, superpath: str) -> bool: + """Returns True if subpath is a trailing subpath of superpath, i.e. if `superpath` ends with `subpath` + after normalizing both paths.""" + normalized_subpath: str = os.path.normcase(os.path.normpath(subpath)) + normalized_superpath: str = os.path.normcase(os.path.normpath(superpath)) + return normalized_superpath.endswith(normalized_subpath) + + +# A very simple matcher, returns True iff `where` matches `frame`. 
+def match_where_to_frame( + where: Where, + frame: FrameIR, +) -> bool: + if where.file is not None and not is_subpath(where.file, frame.loc.path): + return False + if where.function is not None: + fn = frame.function + if "(" in fn: + fn = fn.split("(")[0] + if where.function != fn: + return False + if where.lines is not None: + if frame.loc.lineno not in where.get_lines(): + return False + if ( + where.for_hit_count is not None + or where.after_hit_count is not None + or where.conditions is not None + ): + raise NotImplementedError( + "!where hit counts and conditions currently unsupported." + ) + return True + +def get_active_where_expects( + script: DexterScript, step_info: StepIR +) -> Dict[Where, Tuple[int, List[Value]]]: + """Match the script against the step_info, producing a dict that maps each !where that matches a stack frame to the + index of the (outermost) stack frame that it matches, and if the frame that it matches is the current stack frame + (i.e. the frame index is 0), also includes a list of every direct child !expect node for that !where. + """ + active_where_expects: Dict[Where, Tuple[int, List[Value]]] = {} + + def get_active_wheres(where: Where, scope: Scope): + if scope.where: + raise NotImplementedError( + "Support for nested !where nodes currently unimplemented." + ) + # For this !where, search for the lowest stack frame (e.g. the outermost call) that matches it. + matching_frame_idx = next( + ( + frame_idx + for frame_idx, frame in reversed(list(enumerate(step_info.frames))) + if match_where_to_frame(where, frame) + ), + None, + ) + if matching_frame_idx is not None: + active_where_expects[where] = (matching_frame_idx, []) + + # As we visit the script nodes in pre-order traversal, we can always assume that an expect's parent !where + # has already been visited, and thus should have an entry in active_where_expects if it is active. 
+ def get_active_expects(expect: Expect, expected_value, scope: Scope): + assert isinstance( + expect, Value + ), "Values should be the only type of expect possible!" + if ( + scope.where in active_where_expects + and active_where_expects[scope.where][0] == 0 + ): + active_where_expects[scope.where][1].append(expect) + + script.visit_script(visit_where=get_active_wheres, visit_expect=get_active_expects) + + return active_where_expects diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py new file mode 100644 index 0000000000000..ff326371181ca --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/__init__.py @@ -0,0 +1,10 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Classes for matching observed debugger output to script expectations. +""" + +from dex.evaluation.RunMatch import DebuggerRunMatch diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py index e69de29bb2d1d..6f57e37096f14 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py @@ -0,0 +1,10 @@ +# DExTer : Debugging Experience Tester +# ~~~~~~ ~ ~~ ~ ~~ +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Data and utility methods for Dexter structured scripts. 
+""" + +from dex.test_script.Script import DexterScript, Scope \ No newline at end of file diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index 11f5588fbe710..f366e1a95b4d6 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -12,6 +12,8 @@ import pickle import shutil import platform +import json +from typing import Optional, Union from dex.command.ParseCommand import get_command_infos from dex.debugger.Debuggers import run_debugger_subprocess @@ -21,6 +23,7 @@ ScriptDebuggerController, ) from dex.dextIR.DextIR import DextIR +from dex.evaluation import DebuggerRunMatch from dex.heuristic import Heuristic from dex.test_script.Script import get_dexter_script from dex.tools import TestToolBase @@ -31,10 +34,11 @@ class TestCase(object): - def __init__(self, context, name, heuristic, error): + def __init__(self, context, name, heuristic: Optional[Heuristic]=None, error=None, run_match: Optional[DebuggerRunMatch]=None): self.context = context self.name = name self.heuristic = heuristic + self.run_match = run_match self.error = error @property @@ -75,9 +79,11 @@ def __str__(self): else: error = "" - try: + if self.heuristic is not None: summary = self.heuristic.summary_string - except AttributeError: + elif self.run_match is not None: + summary = "\n" + self.run_match.get_metric_output() + else: summary = "<r>nan/nan (nan)</>" return "{}: {}{}\n{}".format(self.name, summary, error, verbose_error) @@ -171,7 +177,7 @@ def _get_steps(self): debugger_controller = run_debugger_subprocess( debugger_controller, self.context.working_directory.path ) - steps = debugger_controller.step_collection + steps: DextIR = debugger_controller.step_collection return steps def _get_results_basename(self, test_name): @@ -198,6 +204,11 @@ def _get_results_text_path(self, test_name): 
test_results_path = self._get_results_path(test_name) return "{}.txt".format(test_results_path) + def _get_results_json_path(self, test_name): + """Returns path results .json file for test denoted by test_name.""" + test_results_path = self._get_results_path(test_name) + return "{}.json".format(test_results_path) + def _get_results_pickle_path(self, test_name): """Returns path results .dextIR file for test denoted by test_name.""" test_results_path = self._get_results_path(test_name) @@ -216,7 +227,7 @@ def _record_steps(self, test_name, steps): with open(output_dextIR_path, "wb") as fp: pickle.dump(steps, fp, protocol=pickle.HIGHEST_PROTOCOL) - def _record_score(self, test_name, heuristic): + def _record_heuristic(self, test_name, heuristic): """Write out the test's heuristic score to the results .txt file if a results directory has been specified. """ @@ -225,6 +236,15 @@ def _record_score(self, test_name, heuristic): with open(output_text_path, "a") as fp: self.context.o.auto(heuristic.verbose_output, stream=Stream(fp)) + def _record_metrics(self, test_name, run_match: DebuggerRunMatch): + """Write out the test's metrics scores to the results .json file + if a results directory has been specified. + """ + if self.context.options.results_directory: + output_json_path = self._get_results_json_path(test_name) + with open(output_json_path, "w") as fp: + json.dump(run_match.get_metric_json_output(), fp) + def _record_test_and_display(self, test_case): """Output test case to o stream and record test case internally for handling later. @@ -236,19 +256,29 @@ def _record_failed_test(self, test_name, exception): """Instantiate a failed test case with failure exception and store internally. 
""" - test_case = TestCase(self.context, test_name, None, exception) + test_case = TestCase(self.context, test_name, error=exception) self._record_test_and_display(test_case) - def _record_successful_test(self, test_name, steps, heuristic): + def _record_successful_test_heuristic(self, test_name, steps, heuristic): """Instantiate a successful test run, store test for handling later. Display verbose output for test case if required. """ - test_case = TestCase(self.context, test_name, heuristic, None) + test_case = TestCase(self.context, test_name, heuristic=heuristic) self._record_test_and_display(test_case) if self.context.options.verbose: self.context.o.auto("\n{}\n".format(steps)) self.context.o.auto(heuristic.verbose_output) + def _record_successful_test_match(self, test_name, steps, result: DebuggerRunMatch): + """Instantiate a successful test run, store test for handling later. + Display verbose output for test case if required. + """ + test_case = TestCase(self.context, test_name, run_match=result) + if self.context.options.verbose: + self.context.o.auto(f"\n{steps}\n") + self.context.o.auto(f"{result.dump_step_results()}\n") + self._record_test_and_display(test_case) + def _run_test(self, test_name): """Attempt to run test files specified in options.source_files. Store result internally in self._test_cases. 
@@ -272,18 +302,17 @@ def _run_test(self, test_name): for step in steps.steps: print("\n".join(step.detailed_print())) return - assert ( - not self.context.options.use_script - ), "Evaluation not yet supported with --use-script" self._record_steps(test_name, steps) - heuristic_score = Heuristic(self.context, steps) - self._record_score(test_name, heuristic_score) + if self.context.options.use_script: + run_match = DebuggerRunMatch(self.context, steps) + self._record_metrics(test_name, run_match) + self._record_successful_test_match(test_name, steps, run_match) + else: + heuristic_score = Heuristic(self.context, steps) + self._record_heuristic(test_name, heuristic_score) + self._record_successful_test_heuristic(test_name, steps, heuristic_score) except (BuildScriptException, DebuggerException, HeuristicException) as e: self._record_failed_test(test_name, e) - return - - self._record_successful_test(test_name, steps, heuristic_score) - return def _handle_results(self) -> ReturnCode: return_code = ReturnCode.OK diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp new file mode 100644 index 0000000000000..e3a203a396560 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp @@ -0,0 +1,42 @@ +// RUN: %dexter_regression_test_cxx_build %s -o %t +// RUN: %dexter_regression_test_run --use-script --binary %t -- %s | FileCheck %s + +// Test evaluation of a simple Dexter test. 
+ +// CHECK: basic_evaluate.cpp: +// CHECK: total_watched_steps: 6 +// CHECK: correct_steps: 4 +// CHECK: incorrect_steps: 2 +// CHECK: missing_var_steps: 1 +// CHECK: unexpected_value_steps: 1 +// CHECK: correct_step_coverage: 66.7% (4/6) +// CHECK: seen_values: 5 +// CHECK: missing_values: 5 + +int multiply(int b, int a) { + int result = a * b; + return result; +} + +int main() { + int a = 6; + int b = 7; + int c = multiply(a, b); + return c; +} + +/* +--- +!where {lines: 18}: + !value a: 5 # 1 Incorrect, 1 Missing + !value b: 6 # 1 Correct + Seen + !value result: [40, 42] # 1 Correct + Seen, 1 Incorrect + Missing +!where {lines: 25}: + !value a: [6, 6] # 1 Correct, 2 Seen + !value b: 7 # 1 Correct + Seen + !value not_real: 42 # 1 Incorrect + Missing +!where {lines: 100}: # Never entered + !value irrelevant: 10 # 1 Missing + !value unseen: 'abc' # 1 Missing +... +*/ diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp new file mode 100644 index 0000000000000..31ff9b4392b81 --- /dev/null +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp @@ -0,0 +1,17 @@ +// RUN: %dexter_regression_test_cxx_build %s -o %t +// RUN: %dexter_regression_test_run --use-script --binary %t -- %s | FileCheck %s + +// Test evaluation of a Dexter test with no expects. + +// CHECK: evaluate_nothing.cpp: +// CHECK: No expects found. + +int main() { + return 0; +} + +/* +--- +!where {lines: 10}: {} # No expects +... 
+*/ >From dfc55ff3239600b27941eb7e8ec4a6ebe67b7e4f Mon Sep 17 00:00:00 2001 From: Stephen Tozer <[email protected]> Date: Wed, 20 May 2026 16:09:53 +0100 Subject: [PATCH 2/4] Minor fixup(s) --- .../debuginfo-tests/dexter/dex/test_script/__init__.py | 2 +- .../debuginfo-tests/dexter/dex/tools/test/Tool.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py index 6f57e37096f14..00ec7b66679cb 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/test_script/__init__.py @@ -7,4 +7,4 @@ """Data and utility methods for Dexter structured scripts. """ -from dex.test_script.Script import DexterScript, Scope \ No newline at end of file +from dex.test_script.Script import DexterScript, Scope diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index f366e1a95b4d6..47bd77e3c5c72 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -13,7 +13,7 @@ import shutil import platform import json -from typing import Optional, Union +from typing import Optional from dex.command.ParseCommand import get_command_infos from dex.debugger.Debuggers import run_debugger_subprocess >From b61e114f46bd2e04d63e31a0e0234b87e14ae80e Mon Sep 17 00:00:00 2001 From: Stephen Tozer <[email protected]> Date: Wed, 20 May 2026 16:13:45 +0100 Subject: [PATCH 3/4] format --- .../dexter/dex/evaluation/ExpectMatch.py | 8 ++- .../dexter/dex/evaluation/Metrics.py | 26 ++++++-- .../dexter/dex/evaluation/RunMatch.py | 65 +++++++++++++++---- .../dexter/dex/evaluation/StateMatch.py | 1 + .../dexter/dex/tools/test/Tool.py | 13 +++- .../scripts/evaluation/basic_evaluate.cpp | 12 ++-- 
.../scripts/evaluation/evaluate_nothing.cpp | 5 +- 7 files changed, 98 insertions(+), 32 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py index a8f3f584d1e78..7296f2226af8a 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/ExpectMatch.py @@ -12,16 +12,19 @@ from dex.test_script.Nodes import Expect, Value - class DebuggerExpectMatch: """Class that represents the match between a particular expected value for an Expect node and the actual debugger output corresponding to the watched value for that node.""" + def __init__(self, expect: Expect, expected, actual: ValueIR): self.expect = expect self.expected = expected self.actual = actual self.actual_result = self.expect.get_variable_result(self.actual) - self.match_result = self.expected is not None and str(self.expected) == self.actual_result + self.match_result = ( + self.expected is not None and str(self.expected) == self.actual_result + ) + def get_expect_match(expect: Expect, expected_values, actual: ValueIR): """Given one or more expected values for an Expect node and an actual ValueIR, returns a match for the first @@ -33,4 +36,3 @@ def get_expect_match(expect: Expect, expected_values, actual: ValueIR): if expect_match.match_result: return expect_match return DebuggerExpectMatch(expect, None, actual) - diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py index 783f30c95b062..a1cbda2d97065 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py @@ -14,7 +14,7 @@ class Metric: - def __init__(self, improves_asc = True): + def __init__(self, improves_asc=True): self.improves_asc = improves_asc def 
as_scalar(self) -> float: @@ -36,8 +36,9 @@ def compare(self, other): else: return 0 + class ScalarMetric(Metric): - def __init__(self, value: Union[int, float], improves_asc = True): + def __init__(self, value: Union[int, float], improves_asc=True): self.value = value super().__init__(improves_asc) @@ -50,8 +51,9 @@ def aggregate(self, other): def __repr__(self): return f"{self.value}" + class FractionMetric(Metric): - def __init__(self, numerator: int, denominator: int, improves_asc = True): + def __init__(self, numerator: int, denominator: int, improves_asc=True): self.num = numerator self.dom = denominator super().__init__(improves_asc) @@ -63,11 +65,14 @@ def as_pct(self) -> float: return self.as_scalar() * 100 def aggregate(self, other): - return FractionMetric(self.num + other.num, self.dom + other.dom, self.improves_asc) + return FractionMetric( + self.num + other.num, self.dom + other.dom, self.improves_asc + ) def __repr__(self): return f"{self.as_pct():.1f}% ({self.num}/{self.dom})" + def serialize_metric_to_json(metric): if isinstance(metric, ScalarMetric): return metric.value @@ -75,7 +80,10 @@ def serialize_metric_to_json(metric): return metric.as_pct() raise Exception("Invalid metric type!") -def get_variable_metrics(expect: Expect, expected_values: Any, matches: List[DebuggerExpectMatch]) -> Dict[str, Metric]: + +def get_variable_metrics( + expect: Expect, expected_values: Any, matches: List[DebuggerExpectMatch] +) -> Dict[str, Metric]: """Given an Expect node with its expected values and a list of all matches for that Expect in a debugger session, returns the computed metrics for that Expect node.""" assert isinstance(expect, Value), "Non-Value expects currently unsupported" @@ -103,7 +111,9 @@ def get_variable_metrics(expect: Expect, expected_values: Any, matches: List[Deb # The number of steps where the expected value sequence was observed. 
"correct_steps": ScalarMetric(num_correct_steps), # The number of steps which did not match the expected value sequence. - "incorrect_steps": ScalarMetric(num_total_steps - num_correct_steps, improves_asc=False), + "incorrect_steps": ScalarMetric( + num_total_steps - num_correct_steps, improves_asc=False + ), # The number of steps where the watched variable/expression was not available in the debugger. "missing_var_steps": ScalarMetric(num_missing_var_steps, improves_asc=False), # The number of steps where the watched variable/expression had a value not in the set of expected values. @@ -115,6 +125,8 @@ def get_variable_metrics(expect: Expect, expected_values: Any, matches: List[Deb # The number of expected values that were observed at least once. "seen_values": ScalarMetric(num_seen_values), # The number of expected values that were not observed. - "missing_values": ScalarMetric(len(expected_values) - num_seen_values, improves_asc=False), + "missing_values": ScalarMetric( + len(expected_values) - num_seen_values, improves_asc=False + ), } return metrics diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py index f24629e8b0500..40ebcfe387591 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py @@ -15,31 +15,48 @@ from dex.dextIR import DextIR, StepIR from dex.evaluation.ExpectMatch import DebuggerExpectMatch, get_expect_match -from dex.evaluation.Metrics import Metric, get_variable_metrics, serialize_metric_to_json +from dex.evaluation.Metrics import ( + Metric, + get_variable_metrics, + serialize_metric_to_json, +) from dex.evaluation.StateMatch import get_active_where_expects from dex.test_script import DexterScript, Scope from dex.test_script.Nodes import Expect, Value + class DebuggerStepMatch: """Class used to record the match between a DexterScript and a 
StepIR, including the state match, determining which script nodes are "active", and the expect matches, which compare the debugger's output to the DexterScript's expected output.""" + def __init__(self, step: StepIR, script: DexterScript): self.step = step self.script = script self.state_match = get_active_where_expects(script, step) - expects_to_match = {expect for frame_idx, expects in self.state_match.values() for expect in expects} + expects_to_match = { + expect + for frame_idx, expects in self.state_match.values() + for expect in expects + } self.expect_matches: Dict[Expect, DebuggerExpectMatch] = {} + def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): assert isinstance(expect, Value), "Non-Value expects currently unsupported" if expect in expects_to_match: - self.expect_matches[expect] = get_expect_match(expect, expected_value, step.watches[expect.get_watched_expr()]) + self.expect_matches[expect] = get_expect_match( + expect, expected_value, step.watches[expect.get_watched_expr()] + ) + script.visit_script(visit_expect=add_expected_values) + class DebuggerRunMatch(object): """Class used to record the complete match of a debugger session and a DexterScript. Compares debugger steps to the script one-at-a-time, rather than comparing individual variables longtitudinally, as there will exist some shared - state across evaluation that is updated step-by-step and can be shared across variables.""" + state across evaluation that is updated step-by-step and can be shared across variables. + """ + def __init__(self, context, dext_ir: DextIR): self.context = context self.dext_ir = dext_ir @@ -54,10 +71,12 @@ def __init__(self, context, dext_ir: DextIR): # Gather the expected values for each Expect. 
expected_values = {} + def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): assert isinstance(expect, Value), "Non-Value expects currently unsupported" expected_values[expect] = expected_value self.per_expect_results[expect] = [] + script.visit_script(visit_expect=add_expected_values) # Then produce all of our step matches. @@ -67,18 +86,23 @@ def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): # Then, for each expect, produce the list of results for just that variable. for step_match in self.step_matches: for expect, expect_match in step_match.expect_matches.items(): - self.per_expect_results[expect].append((step_match.step.step_index, expect_match)) + self.per_expect_results[expect].append( + (step_match.step.step_index, expect_match) + ) # Finally, compare the match results against the expected values to produce the metrics. for expect, expect_results in self.per_expect_results.items(): expect_matches = [match for step, match in expect_results] - expect_metrics = get_variable_metrics(expect, expected_values[expect], expect_matches) + expect_metrics = get_variable_metrics( + expect, expected_values[expect], expect_matches + ) for metric_name, metric in expect_metrics.items(): if metric_name not in self.metrics: self.metrics[metric_name] = metric else: - self.metrics[metric_name] = self.metrics[metric_name].aggregate(metric) - + self.metrics[metric_name] = self.metrics[metric_name].aggregate( + metric + ) def dump_step_results(self) -> str: result = "" @@ -91,15 +115,29 @@ def dump_step_results(self) -> str: if not frame_active_wheres: result += f" No active !where nodes.\n" continue - frame_active_wheres_list = sorted([(frame_idx, wheres) for frame_idx, wheres in frame_active_wheres.items()], key=lambda entry: entry[0]) + frame_active_wheres_list = sorted( + [ + (frame_idx, wheres) + for frame_idx, wheres in frame_active_wheres.items() + ], + key=lambda entry: entry[0], + ) result += f" Active !where nodes:\n" for 
frame_idx, wheres in frame_active_wheres_list: result += f" Frame {frame_idx}: [{', '.join(wheres)}]\n" if not step_match.expect_matches: continue result += f" Active !expect nodes:\n" - matching_expects = [(expect, match) for expect, match in step_match.expect_matches.items() if match.match_result] - non_matching_expects = [(expect, match) for expect, match in step_match.expect_matches.items() if not match.match_result] + matching_expects = [ + (expect, match) + for expect, match in step_match.expect_matches.items() + if match.match_result + ] + non_matching_expects = [ + (expect, match) + for expect, match in step_match.expect_matches.items() + if not match.match_result + ] if matching_expects: result += f" Matching nodes: [{', '.join(f'{expect}={match.actual_result}' for expect, match in matching_expects)}]\n" if non_matching_expects: @@ -117,4 +155,7 @@ def get_metric_output(self): def get_metric_json_output(self): if not self.metrics: return "No expects found." - return {metric_type: serialize_metric_to_json(metric) for metric_type, metric in self.metrics.items()} + return { + metric_type: serialize_metric_to_json(metric) + for metric_type, metric in self.metrics.items() + } diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py index cb532dbc5e198..72d425ad29b75 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py @@ -49,6 +49,7 @@ def match_where_to_frame( ) return True + def get_active_where_expects( script: DexterScript, step_info: StepIR ) -> Dict[Where, Tuple[int, List[Value]]]: diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index 47bd77e3c5c72..cc0adcf949217 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ 
b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -34,7 +34,14 @@ class TestCase(object): - def __init__(self, context, name, heuristic: Optional[Heuristic]=None, error=None, run_match: Optional[DebuggerRunMatch]=None): + def __init__( + self, + context, + name, + heuristic: Optional[Heuristic] = None, + error=None, + run_match: Optional[DebuggerRunMatch] = None, + ): self.context = context self.name = name self.heuristic = heuristic @@ -310,7 +317,9 @@ def _run_test(self, test_name): else: heuristic_score = Heuristic(self.context, steps) self._record_heuristic(test_name, heuristic_score) - self._record_successful_test_heuristic(test_name, steps, heuristic_score) + self._record_successful_test_heuristic( + test_name, steps, heuristic_score + ) except (BuildScriptException, DebuggerException, HeuristicException) as e: self._record_failed_test(test_name, e) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp index e3a203a396560..113044a6510a8 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/basic_evaluate.cpp @@ -14,15 +14,15 @@ // CHECK: missing_values: 5 int multiply(int b, int a) { - int result = a * b; - return result; + int result = a * b; + return result; } int main() { - int a = 6; - int b = 7; - int c = multiply(a, b); - return c; + int a = 6; + int b = 7; + int c = multiply(a, b); + return c; } /* diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp index 31ff9b4392b81..40456442ebad8 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp +++ 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/scripts/evaluation/evaluate_nothing.cpp @@ -7,11 +7,12 @@ // CHECK: No expects found. int main() { - return 0; + // A comment. + return 0; } /* --- -!where {lines: 10}: {} # No expects +!where {lines: 11}: {} # No expects ... */ >From 89fd5455d3811fa67a1491aaed59710eb5ea9176 Mon Sep 17 00:00:00 2001 From: Stephen Tozer <[email protected]> Date: Fri, 22 May 2026 17:30:29 +0100 Subject: [PATCH 4/4] Address review comments --- .../debuginfo-tests/dexter/dex/evaluation/Metrics.py | 10 +++++++++- .../debuginfo-tests/dexter/dex/evaluation/RunMatch.py | 7 ++++--- .../dexter/dex/evaluation/StateMatch.py | 4 ++-- .../debuginfo-tests/dexter/dex/tools/test/Tool.py | 10 ++++++---- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py index a1cbda2d97065..551eb153c0710 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/Metrics.py @@ -46,6 +46,9 @@ def as_scalar(self) -> float: return float(self.value) def aggregate(self, other): + assert ( + self.improves_asc == other.improves_asc + ), "Trying to aggregate different metrics?" return ScalarMetric(self.value + other.value, self.improves_asc) def __repr__(self): @@ -59,12 +62,17 @@ def __init__(self, numerator: int, denominator: int, improves_asc=True): super().__init__(improves_asc) def as_scalar(self) -> float: + if self.dom == 0: + return float("nan") return float(self.num) / float(self.dom) def as_pct(self) -> float: return self.as_scalar() * 100 def aggregate(self, other): + assert ( + self.improves_asc == other.improves_asc + ), "Trying to aggregate different metrics?" 
return FractionMetric( self.num + other.num, self.dom + other.dom, self.improves_asc ) @@ -102,7 +110,7 @@ def get_variable_metrics( num_missing_var_steps += 1 else: num_unexpected_value_steps += 1 - num_seen_values = sum(1 for ev in expected_values if ev in seen_expected_values) + num_seen_values = sum(ev in seen_expected_values for ev in expected_values) # And finally produce the metrics map and add the new result to the list. metrics = { # The number of steps. Though this is not a useful metric in itself, it may be useful to see in tandem with diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py index 40ebcfe387591..adb710dddacde 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/RunMatch.py @@ -52,9 +52,10 @@ def add_expected_values(expect: Expect, expected_value: Any, scope: Scope): class DebuggerRunMatch(object): - """Class used to record the complete match of a debugger session and a DexterScript. Compares debugger steps to the - script one-at-a-time, rather than comparing individual variables longtitudinally, as there will exist some shared - state across evaluation that is updated step-by-step and can be shared across variables. + """Class used to record the complete match of a debugger session and a DexterScript. It is necessary to match + step-by-step rather than variable-by-variable (i.e. we evaluate all variables for a step before evaluating the + next step), because there are features (yet to be implemented) which allow the match of one variable at step N to + affect the match of another variable at step N+1, thus we go one step at a time. 
""" def __init__(self, context, dext_ir: DextIR): diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py index 72d425ad29b75..6008e47ca280e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/evaluation/StateMatch.py @@ -16,8 +16,8 @@ def is_subpath(subpath: str, superpath: str) -> bool: - """Returns True if subpath is not a trailing subpath of superpath, i.e. if `superpath` does not end with `subpath` - after normalizing both paths.""" + """Returns True if subpath is a trailing subpath of superpath, i.e. if `superpath` ends with `subpath` after + normalizing both paths.""" normalized_subpath: str = os.path.normcase(os.path.normpath(subpath)) normalized_superpath: str = os.path.normcase(os.path.normpath(superpath)) return normalized_superpath.endswith(normalized_subpath) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py index cc0adcf949217..0c028773ec56c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/tools/test/Tool.py @@ -234,7 +234,7 @@ def _record_steps(self, test_name, steps): with open(output_dextIR_path, "wb") as fp: pickle.dump(steps, fp, protocol=pickle.HIGHEST_PROTOCOL) - def _record_heuristic(self, test_name, heuristic): + def _record_dex_command_heuristic_score(self, test_name, heuristic): """Write out the test's heuristic score to the results .txt file if a results directory has been specified. 
""" @@ -243,7 +243,9 @@ def _record_heuristic(self, test_name, heuristic): with open(output_text_path, "a") as fp: self.context.o.auto(heuristic.verbose_output, stream=Stream(fp)) - def _record_metrics(self, test_name, run_match: DebuggerRunMatch): + def _record_structured_script_metric_results( + self, test_name, run_match: DebuggerRunMatch + ): """Write out the test's metrics scores to the results .txt file if a results directory has been specified. """ @@ -312,11 +314,11 @@ def _run_test(self, test_name): self._record_steps(test_name, steps) if self.context.options.use_script: run_match = DebuggerRunMatch(self.context, steps) - self._record_metrics(test_name, run_match) + self._record_structured_script_metric_results(test_name, run_match) self._record_successful_test_match(test_name, steps, run_match) else: heuristic_score = Heuristic(self.context, steps) - self._record_heuristic(test_name, heuristic_score) + self._record_dex_command_heuristic_score(test_name, heuristic_score) self._record_successful_test_heuristic( test_name, steps, heuristic_score ) _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
