This is an automated email from the ASF dual-hosted git repository.
amoghdesai pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new caafc704462 Defining large file detection heuristics to automatically
run full tests (#58575)
caafc704462 is described below
commit caafc704462cc15bedde67ef416d870ceabd13e8
Author: Amogh Desai <[email protected]>
AuthorDate: Sat Nov 22 15:53:23 2025 +0530
Defining large file detection heuristics to automatically run full tests
(#58575)
---
.../src/airflow_breeze/utils/selective_checks.py | 70 +++++++++++++++++
dev/breeze/tests/test_selective_checks.py | 90 ++++++++++++++++++++++
2 files changed, 160 insertions(+)
diff --git a/dev/breeze/src/airflow_breeze/utils/selective_checks.py b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
index a70e4dab1ef..f98bde361de 100644
--- a/dev/breeze/src/airflow_breeze/utils/selective_checks.py
+++ b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
@@ -583,6 +583,8 @@ class SelectiveChecks:
):
get_console().print("[warning]Running full set of tests because
tests/utils changed[/]")
return True
+ if self._is_large_enough_pr():
+ return True
if FULL_TESTS_NEEDED_LABEL in self._pr_labels:
get_console().print(
"[warning]Full tests needed because "
@@ -591,6 +593,74 @@ class SelectiveChecks:
return True
return False
+    def _is_large_enough_pr(self) -> bool:
+        """
+        Check if PR is large enough to run full tests.
+
+        The heuristics are based on number of files changed and total lines changed,
+        while excluding generated files which can be ignored (newsfragments and lock files).
+
+        :return: True when the number of non-excluded files reaches ``FILE_THRESHOLD`` or when the
+            added plus deleted lines reported by ``git diff --numstat`` for those files reach
+            ``LINE_THRESHOLD``. False otherwise, including when the size cannot be determined.
+        """
+        FILE_THRESHOLD = 25
+        LINE_THRESHOLD = 500
+        cannot_determine = "[warning]Cannot determine if PR is big enough, skipping the check[/]"
+
+        if not self._files:
+            return False
+
+        exclude_patterns = [
+            r"/newsfragments/",
+            r"^uv\.lock$",
+            r"pnpm-lock\.yaml$",
+            r"package-lock\.json$",
+        ]
+
+        relevant_files = [
+            f for f in self._files if not any(re.search(pattern, f) for pattern in exclude_patterns)
+        ]
+        if not relevant_files:
+            # Without any pathspec "git diff" reports every file of the commit, which would bring
+            # the excluded files back into the line count - so stop here instead.
+            return False
+
+        files_changed = len(relevant_files)
+        if files_changed >= FILE_THRESHOLD:
+            get_console().print(
+                f"[warning]Running full set of tests because PR touches {files_changed} files "
+                f"(≥{FILE_THRESHOLD} threshold)[/]"
+            )
+            return True
+
+        if not self._commit_ref:
+            get_console().print(cannot_determine)
+            return False
+
+        commit_range = f"{self._commit_ref}^...{self._commit_ref}"
+        try:
+            result = run_command(
+                # "--" separates revisions from paths, so files that no longer exist in the
+                # working tree (deleted by the PR) are not rejected as ambiguous arguments.
+                ["git", "diff", "--numstat", commit_range, "--", *relevant_files],
+                capture_output=True,
+                text=True,
+                cwd=AIRFLOW_ROOT_PATH,
+                check=False,
+            )
+        except Exception:
+            # The size check is best-effort: report that it was skipped rather than failing.
+            get_console().print(cannot_determine)
+            return False
+
+        if result.returncode != 0:
+            get_console().print(cannot_determine)
+            return False
+
+        total_lines = 0
+        for line in (result.stdout or "").splitlines():
+            # Each numstat row is "<added>\t<deleted>\t<path>".
+            added, _, rest = line.partition("\t")
+            deleted = rest.partition("\t")[0]
+            # Binary files are reported as "-\t-\t<path>" and carry no line counts.
+            if added.isdigit() and deleted.isdigit():
+                total_lines += int(added) + int(deleted)
+
+        if total_lines >= LINE_THRESHOLD:
+            get_console().print(
+                f"[warning]Running full set of tests because PR changes {total_lines} lines "
+                f"in {files_changed} files[/]"
+            )
+            return True
+
+        return False
+
@cached_property
def python_versions(self) -> list[str]:
if self.all_versions:
diff --git a/dev/breeze/tests/test_selective_checks.py b/dev/breeze/tests/test_selective_checks.py
index 8ab2f6f66cd..f1b8c4e3e4a 100644
--- a/dev/breeze/tests/test_selective_checks.py
+++ b/dev/breeze/tests/test_selective_checks.py
@@ -2945,3 +2945,93 @@ def test_provider_dependency_bump_check_in_optional_dependencies(mock_run_comman
github_event=GithubEvents.PULL_REQUEST,
default_branch="main",
).provider_dependency_bump
+
+
+@pytest.mark.parametrize(
+    ("files", "expected_outputs"),
+    [
+        pytest.param(
+            (
+                "airflow-core/src/airflow/models/dag.py",
+                "airflow-core/src/airflow/models/taskinstance.py",
+                "airflow-core/tests/unit/models/test_dag.py",
+                "task-sdk/src/airflow/sdk/definitions/dag.py",
+                "task-sdk/tests/task_sdk/definitions/test_dag.py",
+            ),
+            {
+                "full-tests-needed": "false",
+            },
+            id="Small PR with 5 files changed",
+        ),
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in range(30)),
+            {
+                "full-tests-needed": "true",
+            },
+            id="Large PR with 30 files changed",
+        ),
+        pytest.param(
+            (
+                "uv.lock",
+                "package-lock.json",
+            ),
+            {
+                "full-tests-needed": "false",
+            },
+            id="PR with only lock files changed",
+        ),
+    ],
+)
+def test_large_pr_by_file_count(files: tuple[str, ...], expected_outputs: dict[str, str]):
+    """
+    Check the ``full-tests-needed`` output for PRs that differ in the number of changed files.
+
+    Covers a 5-file PR (expected "false"), a 30-file PR (expected "true") and a PR
+    touching only lock files (expected "false").
+    """
+    # NOTE(review): run_command is not patched in this test, so the line-count part of the
+    # heuristic presumably runs git against NEUTRAL_COMMIT - confirm this is intended.
+    stderr = SelectiveChecks(
+        files=files,
+        commit_ref=NEUTRAL_COMMIT,
+        github_event=GithubEvents.PULL_REQUEST,
+        default_branch="main",
+    )
+    assert_outputs_are_printed(expected_outputs, str(stderr))
+
+
+@pytest.mark.parametrize(
+    ("files", "git_diff_output", "expected_outputs"),
+    [
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in range(10)),
+            "\n".join([f"10\t10\tairflow-core/src/airflow/models/file{i}.py" for i in range(10)]),
+            {
+                "full-tests-needed": "false",
+            },
+            id="Small PR with 200 lines changed",
+        ),
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in range(10)),
+            "\n".join([f"30\t30\tairflow-core/src/airflow/models/file{i}.py" for i in range(10)]),
+            {
+                "full-tests-needed": "true",
+            },
+            id="PR with 600 lines changed",
+        ),
+        pytest.param(
+            ("airflow-core/src/airflow/configuration.py",),
+            "500\t500\tairflow-core/src/airflow/configuration.py",
+            {
+                "full-tests-needed": "true",
+            },
+            id="Single large file with 1000 lines",
+        ),
+    ],
+)
+def test_large_pr_by_line_count(
+    files: tuple[str, ...], git_diff_output: str, expected_outputs: dict[str, str]
+):
+    """
+    Check the ``full-tests-needed`` output for PRs that differ in the number of changed lines.
+
+    ``run_command`` is patched to return ``git_diff_output`` as its stdout with return code 0,
+    so each case supplies its own ``git diff --numstat``-style rows ("<added>\\t<deleted>\\t<path>").
+    """
+    mock_result = Mock(returncode=0, stdout=git_diff_output)
+    with patch("airflow_breeze.utils.selective_checks.run_command") as mock_run:
+        mock_run.return_value = mock_result
+
+        stderr = SelectiveChecks(
+            files=files,
+            commit_ref=NEUTRAL_COMMIT,
+            github_event=GithubEvents.PULL_REQUEST,
+            default_branch="main",
+        )
+        # Rendering the outputs happens inside the patch context so that the patched
+        # run_command is the one in effect while str(stderr) is evaluated.
+        assert_outputs_are_printed(expected_outputs, str(stderr))