This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new ac0d5b3dbe Improve detection of when breeze CI image needs rebuilding
(#33603)
ac0d5b3dbe is described below
commit ac0d5b3dbe731605af38018ce7ce970ffded539a
Author: Jarek Potiuk <[email protected]>
AuthorDate: Tue Aug 22 13:27:03 2023 +0200
Improve detection of when breeze CI image needs rebuilding (#33603)
* Improve detection of when breeze CI image needs rebuilding
Previously we have been using provider.yaml file modification as
a sign that the docker image needs rebuilding when starting image.
However just modification of provider.yaml file is not a sign
that the image needs rebuilding. The image needs rebuilding when
provider dependencies changed, but there are many more reasons why
provider.yaml file changed - especially recently provider.yaml
file contains much more information and dependencies are only part
of it. Provider.yaml files can also be modified by release manager
wnen documentation is prepared, but none of the documentation
change is a reason for rebuilding the image.
This PR optimize the check for image building introducing two
step process:
* first we check if provider.yaml files changed
* if they did, we regenerate provider dependencies by manully
running the pre-commit script
* then provider_dependencies.json is used instead of all providers
to determine if the image needs rebuilding
This has several nice side effects:
* the list of files that have been modified displayed to the
user is potentially much smaller (no provider.yaml files)
* provider_dependencies.json is regenereated automatically when
you run any breeze command, which means that you do not have
to have pre-commit installed to regenerate it
* the notification "image needs rebuilding" will be printed less
frequently to the user - only when it is really needed
* preparing provider documentation in CI will not trigger
image rebuilding (which might occasionally fail in such case
especially when we bring back a provider from long suspension
like it happened in #33574
* Update dev/breeze/src/airflow_breeze/commands/developer_commands.py
---
.../airflow_breeze/commands/ci_image_commands.py | 6 +-
.../airflow_breeze/commands/developer_commands.py | 3 +
dev/breeze/src/airflow_breeze/global_constants.py | 2 +-
.../src/airflow_breeze/params/build_ci_params.py | 1 +
.../src/airflow_breeze/params/shell_params.py | 1 +
.../src/airflow_breeze/utils/md5_build_check.py | 70 +++++++++++++++++-----
.../pre_commit_update_providers_dependencies.py | 34 ++++++++---
7 files changed, 92 insertions(+), 25 deletions(-)
diff --git a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
index 124d6ca318..2eecbfff3c 100644
--- a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
@@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params:
BuildCiParams) -> bool:
# We import those locally so that click autocomplete works
from inputimeout import TimeoutOccurred
- if not
md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir):
+ if not md5sum_check_if_build_is_needed(
+ md5sum_cache_dir=build_ci_params.md5sum_cache_dir,
+
skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check,
+ ):
return False
try:
answer = user_confirm(
@@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params:
ShellParams | BuildCiPara
image_tag=command_params.image_tag,
platform=command_params.platform,
force_build=command_params.force_build,
+
skip_provider_dependencies_check=command_params.skip_provider_dependencies_check,
)
if command_params.image_tag is not None and command_params.image_tag !=
"latest":
return_code, message = run_pull_image(
diff --git a/dev/breeze/src/airflow_breeze/commands/developer_commands.py
b/dev/breeze/src/airflow_breeze/commands/developer_commands.py
index 40fc095478..671ced7d5a 100644
--- a/dev/breeze/src/airflow_breeze/commands/developer_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/developer_commands.py
@@ -515,6 +515,9 @@ def static_checks(
force_build=force_build,
image_tag=image_tag,
github_repository=github_repository,
+ # for static checks we do not want to regenerate dependencies before
pre-commits are run
+ # we want the pre-commit to do it for us (and detect the case the
dependencies are updated)
+ skip_provider_dependencies_check=True,
)
if not skip_image_check:
rebuild_or_pull_ci_image_if_needed(command_params=build_params)
diff --git a/dev/breeze/src/airflow_breeze/global_constants.py
b/dev/breeze/src/airflow_breeze/global_constants.py
index c0740801d5..6e07343aec 100644
--- a/dev/breeze/src/airflow_breeze/global_constants.py
+++ b/dev/breeze/src/airflow_breeze/global_constants.py
@@ -323,13 +323,13 @@ FILES_FOR_REBUILD_CHECK = [
"setup.cfg",
"Dockerfile.ci",
".dockerignore",
+ "generated/provider_dependencies.json",
"scripts/docker/common.sh",
"scripts/docker/install_additional_dependencies.sh",
"scripts/docker/install_airflow.sh",
"scripts/docker/install_airflow_dependencies_from_branch_tip.sh",
"scripts/docker/install_from_docker_context_files.sh",
"scripts/docker/install_mysql.sh",
- *ALL_PROVIDER_YAML_FILES,
]
ENABLED_SYSTEMS = ""
diff --git a/dev/breeze/src/airflow_breeze/params/build_ci_params.py
b/dev/breeze/src/airflow_breeze/params/build_ci_params.py
index 1f0663ec9f..1f49f0597b 100644
--- a/dev/breeze/src/airflow_breeze/params/build_ci_params.py
+++ b/dev/breeze/src/airflow_breeze/params/build_ci_params.py
@@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams):
airflow_pre_cached_pip_packages: bool = True
force_build: bool = False
eager_upgrade_additional_requirements: str = ""
+ skip_provider_dependencies_check: bool = False
@property
def airflow_version(self):
diff --git a/dev/breeze/src/airflow_breeze/params/shell_params.py
b/dev/breeze/src/airflow_breeze/params/shell_params.py
index a70a404d0e..99bc8c4d3c 100644
--- a/dev/breeze/src/airflow_breeze/params/shell_params.py
+++ b/dev/breeze/src/airflow_breeze/params/shell_params.py
@@ -125,6 +125,7 @@ class ShellParams:
celery_flower: bool = False
only_min_version_update: bool = False
regenerate_missing_docs: bool = False
+ skip_provider_dependencies_check: bool = False
def clone_with_test(self, test_type: str) -> ShellParams:
new_params = deepcopy(self)
diff --git a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
index 235fbf2249..54b46c9916 100644
--- a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
+++ b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py
@@ -20,11 +20,14 @@ Utilities to check - with MD5 - whether files have been
modified since the last
from __future__ import annotations
import hashlib
+import os
+import sys
from pathlib import Path
-from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK
+from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES,
FILES_FOR_REBUILD_CHECK
from airflow_breeze.utils.console import get_console
from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT
+from airflow_breeze.utils.run_utils import run_command
def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path,
update: bool) -> bool:
@@ -59,8 +62,19 @@ def generate_md5(filename, file_size: int = 65536):
return hash_md5.hexdigest()
+def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update:
bool):
+ file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check
+ md5_checksum = generate_md5(file_to_get_md5)
+ sub_dir_name = file_to_get_md5.parts[-2]
+ actual_file_name = file_to_get_md5.parts[-1]
+ cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" +
actual_file_name + ".md5sum")
+ file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
+ is_modified = check_md5checksum_in_cache_modified(file_content,
cache_file_name, update=update)
+ return is_modified
+
+
def calculate_md5_checksum_for_files(
- md5sum_cache_dir: Path, update: bool = False
+ md5sum_cache_dir: Path, update: bool = False,
skip_provider_dependencies_check: bool = False
) -> tuple[list[str], list[str]]:
"""
Calculates checksums for all interesting files and stores the hashes in
the md5sum_cache_dir.
@@ -68,35 +82,63 @@ def calculate_md5_checksum_for_files(
:param md5sum_cache_dir: directory where to store cached information
:param update: whether to update the hashes
+ :param skip_provider_dependencies_check: whether to skip regeneration of
the provider dependencies
:return: Tuple of two lists: modified and not-modified files
"""
not_modified_files = []
modified_files = []
- for calculate_md5_file in FILES_FOR_REBUILD_CHECK:
- file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file
- md5_checksum = generate_md5(file_to_get_md5)
- sub_dir_name = file_to_get_md5.parts[-2]
- actual_file_name = file_to_get_md5.parts[-1]
- cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" +
actual_file_name + ".md5sum")
- file_content = md5_checksum + " " + str(file_to_get_md5) + "\n"
- is_modified = check_md5checksum_in_cache_modified(file_content,
cache_file_name, update=update)
+ if not skip_provider_dependencies_check:
+ modified_provider_yaml_files = []
+ for file in ALL_PROVIDER_YAML_FILES:
+ # Only check provider yaml files once and save the result
immediately.
+ # If we need to regenerate the dependencies and they are not
modified then
+ # all is fine and we can save checksums for the new files
+ if check_md5_sum_for_file(file, md5sum_cache_dir, True):
+ modified_provider_yaml_files.append(file)
+ if modified_provider_yaml_files:
+ get_console().print(
+ "[info]Attempting to generate provider dependencies. "
+ "Provider yaml files changed since last check:[/]"
+ )
+ get_console().print(
+ [os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in
modified_provider_yaml_files]
+ )
+ # Regenerate provider_dependencies.json
+ run_command(
+ [
+ sys.executable,
+ os.fspath(
+ AIRFLOW_SOURCES_ROOT
+ / "scripts"
+ / "ci"
+ / "pre_commit"
+ / "pre_commit_update_providers_dependencies.py"
+ ),
+ ],
+ cwd=AIRFLOW_SOURCES_ROOT,
+ )
+ for file in FILES_FOR_REBUILD_CHECK:
+ is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update)
if is_modified:
- modified_files.append(calculate_md5_file)
+ modified_files.append(file)
else:
- not_modified_files.append(calculate_md5_file)
+ not_modified_files.append(file)
return modified_files, not_modified_files
-def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool:
+def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path,
skip_provider_dependencies_check: bool) -> bool:
"""
Checks if build is needed based on whether important files were modified.
:param md5sum_cache_dir: directory where cached md5 sums are stored
+ :param skip_provider_dependencies_check: whether to skip regeneration of
the provider dependencies
:return: True if build is needed.
"""
build_needed = False
- modified_files, not_modified_files =
calculate_md5_checksum_for_files(md5sum_cache_dir, update=False)
+ modified_files, not_modified_files = calculate_md5_checksum_for_files(
+ md5sum_cache_dir, update=False,
skip_provider_dependencies_check=skip_provider_dependencies_check
+ )
if modified_files:
get_console().print(
f"[warning]The following important files are modified in
{AIRFLOW_SOURCES_ROOT} "
diff --git a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
index e2fc0ca713..b63a60b14b 100755
--- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
+++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py
@@ -209,12 +209,28 @@ if __name__ == "__main__":
console.print("[red]Errors found during verification. Exiting!")
console.print()
sys.exit(1)
-
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies,
indent=2) + "\n")
- console.print(
- f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file
- "
- f"do not modify the file manually. Let pre-commit do the job!"
- )
- console.print()
- console.print("[green]Verification complete! Success!\n")
- console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}")
- console.print()
+ old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text()
+ new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n"
+ if new_dependencies != old_dependencies:
+
DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies,
indent=2) + "\n")
+ if os.environ.get("CI"):
+ console.print()
+ console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
+ console.print(
+ f"[yellow]You will need to run breeze locally and commit "
+
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
+ )
+ console.print()
+ else:
+ console.print()
+ console.print(
+ f"[yellow]Regenerated new dependencies. Please commit "
+
f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n"
+ )
+ console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}")
+ console.print()
+ else:
+ console.print(
+ "[green]No need to regenerate dependencies!\n[/]"
+ f"The
{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to
date!\n"
+ )