This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch v3-2-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/v3-2-test by this push:
new 9d2c5889788 Remove large-PR heuristic from selective checks (#68109)
(#68145)
9d2c5889788 is described below
commit 9d2c58897883c14a9eccf09086284991ab54da92
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun Jun 7 00:12:00 2026 +0200
Remove large-PR heuristic from selective checks (#68109) (#68145)
Selective checks forced the full test matrix whenever a PR touched 25+
files or changed 500+ lines of production code. This size-based heuristic
made large but low-risk PRs run the entire CI suite, so drop it.
Full tests are still triggered by the targeted rules (env/API/provider
file changes, the "full tests needed" label, missing commit ref, etc.).
The *_PRODUCTION_FILES groups are kept — they still feed the SAST/SCA
scan target (run_python_scans / run_javascript_scans).
(cherry picked from commit a218626923bfe1fecb2f266a5cf8b41b9f7a4a60)
Co-authored-by: Claude Opus 4.8 (1M context) <[email protected]>
---
.../src/airflow_breeze/utils/selective_checks.py | 97 +------------
dev/breeze/tests/test_selective_checks.py | 150 ---------------------
2 files changed, 2 insertions(+), 245 deletions(-)
diff --git a/dev/breeze/src/airflow_breeze/utils/selective_checks.py
b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
index 4b2269b9f8c..3dc1ac0bfe4 100644
--- a/dev/breeze/src/airflow_breeze/utils/selective_checks.py
+++ b/dev/breeze/src/airflow_breeze/utils/selective_checks.py
@@ -201,9 +201,8 @@ CI_FILE_GROUP_MATCHES: HashableDict[FileGroupForCi] =
HashableDict(
FileGroupForCi.PYTHON_PRODUCTION_FILES: [
# Production Python source the runtime ships — excludes tests,
docs,
# dev tooling, and generated files within those trees. Used by
- # `run_python_scans` (SAST/SCA target) and the line-threshold check
- # in `_is_large_enough_pr` to decide whether a PR's diff is large
- # enough to force the full test matrix.
+ # `run_python_scans` (SAST/SCA target) to decide whether the
security
+ # scans need to run.
r"^airflow-core/src/airflow/(?!.*/(?:openapi-gen|i18n/locales)/).*\.py$",
r"^task-sdk/src/airflow/(?!.*_generated\.py$).*\.py$",
r"^airflow-ctl/src/airflowctl/(?!.*generated\.py$).*\.py$",
@@ -701,8 +700,6 @@ class SelectiveChecks:
):
console_print("[warning]Running full set of tests because
tests/utils changed[/]")
return True
- if self._is_large_enough_pr():
- return True
if FULL_TESTS_NEEDED_LABEL in self._pr_labels:
console_print(
"[warning]Full tests needed because "
@@ -711,96 +708,6 @@ class SelectiveChecks:
return True
return False
- def _is_large_enough_pr(self) -> bool:
- """
- Check if PR is large enough to run full tests.
-
- The heuristics are based on number of files changed and total lines
changed,
- while excluding generated files which can be ignored.
-
- The line-count check (``LINE_THRESHOLD``) only counts lines in
production-code
- files — tests, docs, newsfragments, generated files, translations, dev
tooling,
- and similar low-risk paths do not contribute to the line count. A
1000-line test
- or docs PR is not the same shape of risk as a 1000-line change to
scheduler
- code, and only the latter should trigger the full test matrix.
- """
- FILE_THRESHOLD = 25
- LINE_THRESHOLD = 500
-
- if not self._files:
- return False
-
- exclude_patterns = [
- r"/newsfragments/",
- r"^uv\.lock$",
- r"pnpm-lock\.yaml$",
- r"package-lock\.json$",
- ]
-
- relevant_files = [
- f for f in self._files if not any(re.search(pattern, f) for
pattern in exclude_patterns)
- ]
-
- files_changed = len(relevant_files)
- if files_changed >= FILE_THRESHOLD:
- console_print(
- f"[warning]Running full set of tests because PR touches
{files_changed} files "
- f"(≥25 threshold)[/]"
- )
- return True
-
- if not self._commit_ref:
- console_print("[warning]Cannot determine if PR is big enough,
skipping the check[/]")
- return False
-
- # The line-count gate only counts churn in production code. We compose
- # the existing `*_PRODUCTION_FILES` and helm groups rather than rolling
- # a bespoke pattern set, so the definition of "production code" stays
- # in lockstep with the rest of CI (e.g. SAST scans targeted by
- # `run_python_scans` / `run_javascript_scans`).
- production_files = list(
- dict.fromkeys(
- self._matching_files(FileGroupForCi.PYTHON_PRODUCTION_FILES,
CI_FILE_GROUP_MATCHES)
- +
self._matching_files(FileGroupForCi.JAVASCRIPT_PRODUCTION_FILES,
CI_FILE_GROUP_MATCHES)
- + self._matching_files(FileGroupForCi.HELM_FILES,
CI_FILE_GROUP_MATCHES)
- )
- )
- if not production_files:
- return False
-
- try:
- result = run_command(
- ["git", "diff", "--numstat",
f"{self._commit_ref}^...{self._commit_ref}"] + production_files,
- capture_output=True,
- text=True,
- cwd=AIRFLOW_ROOT_PATH,
- check=False,
- )
-
- if result.returncode == 0:
- total_lines = 0
- for line in result.stdout.strip().split("\n"):
- if line:
- parts = line.split("\t")
- if len(parts) >= 2:
- try:
- additions = int(parts[0])
- deletions = int(parts[1])
- total_lines += additions + deletions
- except ValueError:
- pass
- if total_lines >= LINE_THRESHOLD:
- console_print(
- f"[warning]Running full set of tests because PR
changes {total_lines} lines "
- f"of production code in {len(production_files)}
file(s) "
- f"(of {files_changed} relevant file(s))[/]"
- )
- return True
- except Exception:
- pass
-
- return False
-
@cached_property
def python_versions(self) -> list[str]:
if self.all_versions:
diff --git a/dev/breeze/tests/test_selective_checks.py
b/dev/breeze/tests/test_selective_checks.py
index 5b62fc074a1..430122acac8 100644
--- a/dev/breeze/tests/test_selective_checks.py
+++ b/dev/breeze/tests/test_selective_checks.py
@@ -3470,156 +3470,6 @@ def
test_provider_dependency_bump_check_in_optional_dependencies(mock_run_comman
).provider_dependency_bump
-@pytest.mark.parametrize(
- ("files", "expected_outputs"),
- [
- pytest.param(
- (
- "airflow-core/src/airflow/models/dag.py",
- "airflow-core/src/airflow/models/taskinstance.py",
- "airflow-core/tests/unit/models/test_dag.py",
- "task-sdk/src/airflow/sdk/definitions/dag.py",
- "task-sdk/tests/task_sdk/definitions/test_dag.py",
- ),
- {
- "full-tests-needed": "false",
- },
- id="Small PR with 5 files changed",
- ),
- pytest.param(
- tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in
range(30)),
- {
- "full-tests-needed": "true",
- },
- id="Large PR with 30 files changed",
- ),
- pytest.param(
- (
- "uv.lock",
- "package-lock.json",
- ),
- {
- "full-tests-needed": "false",
- },
- id="PR with only lock files changed",
- ),
- ],
-)
-def test_large_pr_by_file_count(files, expected_outputs: dict[str, str]):
- stderr = SelectiveChecks(
- files=files,
- commit_ref=NEUTRAL_COMMIT,
- github_event=GithubEvents.PULL_REQUEST,
- default_branch="main",
- )
- assert_outputs_are_printed(expected_outputs, str(stderr))
-
-
-@pytest.mark.parametrize(
- ("files", "git_diff_output", "expected_outputs"),
- [
- pytest.param(
- tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in
range(10)),
- "\n".join([f"10\t10\tairflow-core/src/airflow/models/file{i}.py"
for i in range(10)]),
- {
- "full-tests-needed": "false",
- },
- id="Small PR with 200 lines changed",
- ),
- pytest.param(
- tuple(f"airflow-core/src/airflow/models/file{i}.py" for i in
range(10)),
- "\n".join([f"30\t30\tairflow-core/src/airflow/models/file{i}.py"
for i in range(10)]),
- {
- "full-tests-needed": "true",
- },
- id="PR with 600 lines changed",
- ),
- pytest.param(
- ("airflow-core/src/airflow/configuration.py",),
- "500\t500\tairflow-core/src/airflow/configuration.py",
- {
- "full-tests-needed": "true",
- },
- id="Single large file with 1000 lines",
- ),
- pytest.param(
- tuple(f"airflow-core/tests/unit/models/test_file{i}.py" for i in
range(10)),
-
"\n".join([f"100\t100\tairflow-core/tests/unit/models/test_file{i}.py" for i in
range(10)]),
- {
- "full-tests-needed": "false",
- },
- id="Large test-only PR (2000 lines) does not trigger full tests",
- ),
- pytest.param(
- ("docs/index.rst",
"airflow-core/docs/security/security_model.rst"),
-
"600\t600\tdocs/index.rst\n400\t400\tairflow-core/docs/security/security_model.rst",
- {
- "full-tests-needed": "false",
- },
- id="Large docs-only PR does not trigger full tests",
- ),
- pytest.param(
- (
- "airflow-core/src/airflow/ui/openapi-gen/queries/queries.ts",
- "airflow-ctl/src/airflowctl/api/datamodels/generated.py",
- "task-sdk/src/airflow/sdk/api/datamodels/_generated.py",
- ),
- "\n".join(
- [
-
"400\t400\tairflow-core/src/airflow/ui/openapi-gen/queries/queries.ts",
-
"400\t400\tairflow-ctl/src/airflowctl/api/datamodels/generated.py",
-
"400\t400\ttask-sdk/src/airflow/sdk/api/datamodels/_generated.py",
- ]
- ),
- {
- "full-tests-needed": "false",
- },
- id="Generated-only large PR does not trigger full tests",
- ),
- # In mixed PRs the production-file filter narrows the `git diff
--numstat`
- # call to the production paths, so the mocked stdout below only
contains
- # the production-file rows (mirroring what real git would return for
- # that filtered argument list).
- pytest.param(
- tuple(
- [f"airflow-core/src/airflow/models/file{i}.py" for i in
range(5)]
- + [f"airflow-core/tests/unit/models/test_file{i}.py" for i in
range(5)]
- ),
- "\n".join([f"60\t60\tairflow-core/src/airflow/models/file{i}.py"
for i in range(5)]),
- {
- "full-tests-needed": "true",
- },
- id="Mixed PR with 600 production lines triggers (test lines
excluded but prod >= 500)",
- ),
- pytest.param(
- tuple(
- [f"airflow-core/src/airflow/models/file{i}.py" for i in
range(5)]
- + [f"airflow-core/tests/unit/models/test_file{i}.py" for i in
range(5)]
- ),
- "\n".join([f"20\t20\tairflow-core/src/airflow/models/file{i}.py"
for i in range(5)]),
- {
- "full-tests-needed": "false",
- },
- id="Mixed PR with only 200 production lines does not trigger (test
lines excluded)",
- ),
- ],
-)
-def test_large_pr_by_line_count(files, git_diff_output, expected_outputs:
dict[str, str]):
- with patch("airflow_breeze.utils.selective_checks.run_command") as
mock_run:
- mock_result = Mock()
- mock_result.returncode = 0
- mock_result.stdout = git_diff_output
- mock_run.return_value = mock_result
-
- stderr = SelectiveChecks(
- files=files,
- commit_ref=NEUTRAL_COMMIT,
- github_event=GithubEvents.PULL_REQUEST,
- default_branch="main",
- )
- assert_outputs_are_printed(expected_outputs, str(stderr))
-
-
@patch("airflow_breeze.utils.selective_checks.run_command")
def test_common_compat_changed_with_next_version_passes(mock_run_command):
"""Test that check passes when common.compat changes and other provider
has '# use next version'."""