(airflow) branch main updated: Defining large file detection heuristics to automatically run full tests (#58575)

amoghdesai Sat, 22 Nov 2025 02:23:39 -0800

This is an automated email from the ASF dual-hosted git repository.

amoghdesai pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git



The following commit(s) were added to refs/heads/main by this push:
     new caafc704462 Defining large file detection heuristics to automatically 
run full tests (#58575)
caafc704462 is described below

commit caafc704462cc15bedde67ef416d870ceabd13e8
Author: Amogh Desai <[email protected]>
AuthorDate: Sat Nov 22 15:53:23 2025 +0530

    Defining large file detection heuristics to automatically run full tests 
(#58575)
---
 .../src/airflow_breeze/utils/selective_checks.py   | 70 +++++++++++++++++
 dev/breeze/tests/test_selective_checks.py          | 90 ++++++++++++++++++++++
 2 files changed, 160 insertions(+)

diff --git a/dev/breeze/src/airflow_breeze/utils/selective_checks.py 
b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
index a70e4dab1ef..f98bde361de 100644
--- a/dev/breeze/src/airflow_breeze/utils/selective_checks.py
+++ b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
@@ -583,6 +583,8 @@ class SelectiveChecks:
         ):
             get_console().print("[warning]Running full set of tests because 
tests/utils changed[/]")
             return True
+        if self._is_large_enough_pr():
+            return True
         if FULL_TESTS_NEEDED_LABEL in self._pr_labels:
             get_console().print(
                 "[warning]Full tests needed because "
@@ -591,6 +593,74 @@ class SelectiveChecks:
             return True
         return False
 
+    def _is_large_enough_pr(self) -> bool:
+        """
+        Check if PR is large enough to run full tests.
+
+        The heuristics are based on number of files changed and total lines 
changed,
+        while excluding generated files which can be ignored.
+        """
+        FILE_THRESHOLD = 25
+        LINE_THRESHOLD = 500
+
+        if not self._files:
+            return False
+
+        exclude_patterns = [
+            r"/newsfragments/",
+            r"^uv\.lock$",
+            r"pnpm-lock\.yaml$",
+            r"package-lock\.json$",
+        ]
+
+        relevant_files = [
+            f for f in self._files if not any(re.search(pattern, f) for 
pattern in exclude_patterns)
+        ]
+
+        files_changed = len(relevant_files)
+        if files_changed >= FILE_THRESHOLD:
+            get_console().print(
+                f"[warning]Running full set of tests because PR touches 
{files_changed} files "
+                f"(≥25 threshold)[/]"
+            )
+            return True
+
+        if not self._commit_ref:
+            get_console().print("[warning]Cannot determine if PR is big 
enough, skipping the check[/]")
+            return False
+
+        try:
+            result = run_command(
+                ["git", "diff", "--numstat", 
f"{self._commit_ref}^...{self._commit_ref}"] + relevant_files,
+                capture_output=True,
+                text=True,
+                cwd=AIRFLOW_ROOT_PATH,
+                check=False,
+            )
+
+            if result.returncode == 0:
+                total_lines = 0
+                for line in result.stdout.strip().split("\n"):
+                    if line:
+                        parts = line.split("\t")
+                        if len(parts) >= 2:
+                            try:
+                                additions = int(parts[0])
+                                deletions = int(parts[1])
+                                total_lines += additions + deletions
+                            except ValueError:
+                                pass
+                if total_lines >= LINE_THRESHOLD:
+                    get_console().print(
+                        f"[warning]Running full set of tests because PR 
changes {total_lines} lines "
+                        f"in {files_changed} files[/]"
+                    )
+                    return True
+        except Exception:
+            pass
+
+        return False
+
     @cached_property
     def python_versions(self) -> list[str]:
         if self.all_versions:
diff --git a/dev/breeze/tests/test_selective_checks.py 
b/dev/breeze/tests/test_selective_checks.py
index 8ab2f6f66cd..f1b8c4e3e4a 100644
--- a/dev/breeze/tests/test_selective_checks.py
+++ b/dev/breeze/tests/test_selective_checks.py
@@ -2945,3 +2945,93 @@ def 
test_provider_dependency_bump_check_in_optional_dependencies(mock_run_comman
             github_event=GithubEvents.PULL_REQUEST,
             default_branch="main",
         ).provider_dependency_bump
+
+
[email protected](
+    ("files", "expected_outputs"),
+    [
+        pytest.param(
+            (
+                "airflow-core/src/airflow/models/dag.py",
+                "airflow-core/src/airflow/models/taskinstance.py",
+                "airflow-core/tests/unit/models/test_dag.py",
+                "task-sdk/src/airflow/sdk/definitions/dag.py",
+                "task-sdk/tests/task_sdk/definitions/test_dag.py",
+            ),
+            {
+                "full-tests-needed": "false",
+            },
+            id="Small PR with 5 files changed",
+        ),
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in 
range(30)),
+            {
+                "full-tests-needed": "true",
+            },
+            id="Large PR with 30 files changed",
+        ),
+        pytest.param(
+            (
+                "uv.lock",
+                "package-lock.json",
+            ),
+            {
+                "full-tests-needed": "false",
+            },
+            id="PR with only lock files changed",
+        ),
+    ],
+)
+def test_large_pr_by_file_count(files, expected_outputs: dict[str, str]):
+    stderr = SelectiveChecks(
+        files=files,
+        commit_ref=NEUTRAL_COMMIT,
+        github_event=GithubEvents.PULL_REQUEST,
+        default_branch="main",
+    )
+    assert_outputs_are_printed(expected_outputs, str(stderr))
+
+
[email protected](
+    ("files", "git_diff_output", "expected_outputs"),
+    [
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in 
range(10)),
+            "\n".join([f"10\t10\tairflow-core/src/airflow/models/file{i}.py" 
for i in range(10)]),
+            {
+                "full-tests-needed": "false",
+            },
+            id="Small PR with 200 lines changed",
+        ),
+        pytest.param(
+            tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in 
range(10)),
+            "\n".join([f"30\t30\tairflow-core/src/airflow/models/file{i}.py" 
for i in range(10)]),
+            {
+                "full-tests-needed": "true",
+            },
+            id="PR with 600 lines changed",
+        ),
+        pytest.param(
+            ("airflow-core/src/airflow/configuration.py",),
+            "500\t500\tairflow-core/src/airflow/configuration.py",
+            {
+                "full-tests-needed": "true",
+            },
+            id="Single large file with 1000 lines",
+        ),
+    ],
+)
+def test_large_pr_by_line_count(files, git_diff_output, expected_outputs: 
dict[str, str]):
+    with patch("airflow_breeze.utils.selective_checks.run_command") as 
mock_run:
+        mock_result = Mock()
+        mock_result.returncode = 0
+        mock_result.stdout = git_diff_output
+        mock_run.return_value = mock_result
+
+        stderr = SelectiveChecks(
+            files=files,
+            commit_ref=NEUTRAL_COMMIT,
+            github_event=GithubEvents.PULL_REQUEST,
+            default_branch="main",
+        )
+        assert_outputs_are_printed(expected_outputs, str(stderr))

(airflow) branch main updated: Defining large file detection heuristics to automatically run full tests (#58575)

Reply via email to