This is an automated email from the ASF dual-hosted git repository. potiuk pushed a commit to branch v2-7-test in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 6005da9ba5b1bb7ce004a69a6d8be75d43e12a53 Author: Jarek Potiuk <[email protected]> AuthorDate: Tue Aug 22 13:27:03 2023 +0200 Improve detection of when breeze CI image needs rebuilding (#33603) * Improve detection of when breeze CI image needs rebuilding Previously we have been using provider.yaml file modification as a sign that the docker image needs rebuilding when starting image. However just modification of provider.yaml file is not a sign that the image needs rebuilding. The image needs rebuilding when provider dependencies changed, but there are many more reasons why provider.yaml file changed - especially recently provider.yaml file contains much more information and dependencies are only part of it. Provider.yaml files can also be modified by release manager wnen documentation is prepared, but none of the documentation change is a reason for rebuilding the image. This PR optimize the check for image building introducing two step process: * first we check if provider.yaml files changed * if they did, we regenerate provider dependencies by manully running the pre-commit script * then provider_dependencies.json is used instead of all providers to determine if the image needs rebuilding This has several nice side effects: * the list of files that have been modified displayed to the user is potentially much smaller (no provider.yaml files) * provider_dependencies.json is regenereated automatically when you run any breeze command, which means that you do not have to have pre-commit installed to regenerate it * the notification "image needs rebuilding" will be printed less frequently to the user - only when it is really needed * preparing provider documentation in CI will not trigger image rebuilding (which might occasionally fail in such case especially when we bring back a provider from long suspension like it happened in #33574 * Update dev/breeze/src/airflow_breeze/commands/developer_commands.py (cherry picked from commit ac0d5b3dbe731605af38018ce7ce970ffded539a) --- .../airflow_breeze/commands/ci_image_commands.py | 6 +- .../airflow_breeze/commands/developer_commands.py | 3 + dev/breeze/src/airflow_breeze/global_constants.py | 2 +- .../src/airflow_breeze/params/build_ci_params.py | 1 + .../src/airflow_breeze/params/shell_params.py | 1 + .../src/airflow_breeze/utils/md5_build_check.py | 72 +++++++++++++++++----- .../pre_commit_update_providers_dependencies.py | 34 +++++++--- 7 files changed, 93 insertions(+), 26 deletions(-) diff --git a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py index 124d6ca318..2eecbfff3c 100644 --- a/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/ci_image_commands.py @@ -455,7 +455,10 @@ def should_we_run_the_build(build_ci_params: BuildCiParams) -> bool: # We import those locally so that click autocomplete works from inputimeout import TimeoutOccurred - if not md5sum_check_if_build_is_needed(md5sum_cache_dir=build_ci_params.md5sum_cache_dir): + if not md5sum_check_if_build_is_needed( + md5sum_cache_dir=build_ci_params.md5sum_cache_dir, + skip_provider_dependencies_check=build_ci_params.skip_provider_dependencies_check, + ): return False try: answer = user_confirm( @@ -631,6 +634,7 @@ def rebuild_or_pull_ci_image_if_needed(command_params: ShellParams | BuildCiPara image_tag=command_params.image_tag, platform=command_params.platform, force_build=command_params.force_build, + skip_provider_dependencies_check=command_params.skip_provider_dependencies_check, ) if command_params.image_tag is not None and command_params.image_tag != "latest": return_code, message = run_pull_image( diff --git a/dev/breeze/src/airflow_breeze/commands/developer_commands.py b/dev/breeze/src/airflow_breeze/commands/developer_commands.py index f29584bb32..4a694cac2e 100644 --- a/dev/breeze/src/airflow_breeze/commands/developer_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/developer_commands.py @@ -516,6 +516,9 @@ def static_checks( force_build=force_build, image_tag=image_tag, github_repository=github_repository, + # for static checks we do not want to regenerate dependencies before pre-commits are run + # we want the pre-commit to do it for us (and detect the case the dependencies are updated) + skip_provider_dependencies_check=True, ) if not skip_image_check: rebuild_or_pull_ci_image_if_needed(command_params=build_params) diff --git a/dev/breeze/src/airflow_breeze/global_constants.py b/dev/breeze/src/airflow_breeze/global_constants.py index c9a9066f07..d79a6561d2 100644 --- a/dev/breeze/src/airflow_breeze/global_constants.py +++ b/dev/breeze/src/airflow_breeze/global_constants.py @@ -305,13 +305,13 @@ FILES_FOR_REBUILD_CHECK = [ "setup.cfg", "Dockerfile.ci", ".dockerignore", + "generated/provider_dependencies.json", "scripts/docker/common.sh", "scripts/docker/install_additional_dependencies.sh", "scripts/docker/install_airflow.sh", "scripts/docker/install_airflow_dependencies_from_branch_tip.sh", "scripts/docker/install_from_docker_context_files.sh", "scripts/docker/install_mysql.sh", - *ALL_PROVIDER_YAML_FILES, ] ENABLED_SYSTEMS = "" diff --git a/dev/breeze/src/airflow_breeze/params/build_ci_params.py b/dev/breeze/src/airflow_breeze/params/build_ci_params.py index 8888a7398f..4ad0b82789 100644 --- a/dev/breeze/src/airflow_breeze/params/build_ci_params.py +++ b/dev/breeze/src/airflow_breeze/params/build_ci_params.py @@ -37,6 +37,7 @@ class BuildCiParams(CommonBuildParams): airflow_pre_cached_pip_packages: bool = True force_build: bool = False eager_upgrade_additional_requirements: str = "" + skip_provider_dependencies_check: bool = False @property def airflow_version(self): diff --git a/dev/breeze/src/airflow_breeze/params/shell_params.py b/dev/breeze/src/airflow_breeze/params/shell_params.py index c8dfd9cd84..405fc7bcfb 100644 --- a/dev/breeze/src/airflow_breeze/params/shell_params.py +++ b/dev/breeze/src/airflow_breeze/params/shell_params.py @@ -125,6 +125,7 @@ class ShellParams: celery_flower: bool = False only_min_version_update: bool = False regenerate_missing_docs: bool = False + skip_provider_dependencies_check: bool = False def clone_with_test(self, test_type: str) -> ShellParams: new_params = deepcopy(self) diff --git a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py index 4397fece4d..54b46c9916 100644 --- a/dev/breeze/src/airflow_breeze/utils/md5_build_check.py +++ b/dev/breeze/src/airflow_breeze/utils/md5_build_check.py @@ -20,11 +20,14 @@ Utilities to check - with MD5 - whether files have been modified since the last from __future__ import annotations import hashlib +import os +import sys from pathlib import Path -from airflow_breeze.global_constants import FILES_FOR_REBUILD_CHECK +from airflow_breeze.global_constants import ALL_PROVIDER_YAML_FILES, FILES_FOR_REBUILD_CHECK from airflow_breeze.utils.console import get_console from airflow_breeze.utils.path_utils import AIRFLOW_SOURCES_ROOT +from airflow_breeze.utils.run_utils import run_command def check_md5checksum_in_cache_modified(file_hash: str, cache_path: Path, update: bool) -> bool: @@ -59,8 +62,19 @@ def generate_md5(filename, file_size: int = 65536): return hash_md5.hexdigest() +def check_md5_sum_for_file(file_to_check: str, md5sum_cache_dir: Path, update: bool): + file_to_get_md5 = AIRFLOW_SOURCES_ROOT / file_to_check + md5_checksum = generate_md5(file_to_get_md5) + sub_dir_name = file_to_get_md5.parts[-2] + actual_file_name = file_to_get_md5.parts[-1] + cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum") + file_content = md5_checksum + " " + str(file_to_get_md5) + "\n" + is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update) + return is_modified + + def calculate_md5_checksum_for_files( - md5sum_cache_dir: Path, update: bool = False + md5sum_cache_dir: Path, update: bool = False, skip_provider_dependencies_check: bool = False ) -> tuple[list[str], list[str]]: """ Calculates checksums for all interesting files and stores the hashes in the md5sum_cache_dir. @@ -68,36 +82,64 @@ def calculate_md5_checksum_for_files( :param md5sum_cache_dir: directory where to store cached information :param update: whether to update the hashes + :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies :return: Tuple of two lists: modified and not-modified files """ not_modified_files = [] modified_files = [] - for calculate_md5_file in FILES_FOR_REBUILD_CHECK: - file_to_get_md5 = AIRFLOW_SOURCES_ROOT / calculate_md5_file - md5_checksum = generate_md5(file_to_get_md5) - sub_dir_name = file_to_get_md5.parts[-2] - actual_file_name = file_to_get_md5.parts[-1] - cache_file_name = Path(md5sum_cache_dir, sub_dir_name + "-" + actual_file_name + ".md5sum") - file_content = md5_checksum + " " + str(file_to_get_md5) + "\n" - is_modified = check_md5checksum_in_cache_modified(file_content, cache_file_name, update=update) + if not skip_provider_dependencies_check: + modified_provider_yaml_files = [] + for file in ALL_PROVIDER_YAML_FILES: + # Only check provider yaml files once and save the result immediately. + # If we need to regenerate the dependencies and they are not modified then + # all is fine and we can save checksums for the new files + if check_md5_sum_for_file(file, md5sum_cache_dir, True): + modified_provider_yaml_files.append(file) + if modified_provider_yaml_files: + get_console().print( + "[info]Attempting to generate provider dependencies. " + "Provider yaml files changed since last check:[/]" + ) + get_console().print( + [os.fspath(file.relative_to(AIRFLOW_SOURCES_ROOT)) for file in modified_provider_yaml_files] + ) + # Regenerate provider_dependencies.json + run_command( + [ + sys.executable, + os.fspath( + AIRFLOW_SOURCES_ROOT + / "scripts" + / "ci" + / "pre_commit" + / "pre_commit_update_providers_dependencies.py" + ), + ], + cwd=AIRFLOW_SOURCES_ROOT, + ) + for file in FILES_FOR_REBUILD_CHECK: + is_modified = check_md5_sum_for_file(file, md5sum_cache_dir, update) if is_modified: - modified_files.append(calculate_md5_file) + modified_files.append(file) else: - not_modified_files.append(calculate_md5_file) + not_modified_files.append(file) return modified_files, not_modified_files -def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path) -> bool: +def md5sum_check_if_build_is_needed(md5sum_cache_dir: Path, skip_provider_dependencies_check: bool) -> bool: """ Checks if build is needed based on whether important files were modified. :param md5sum_cache_dir: directory where cached md5 sums are stored + :param skip_provider_dependencies_check: whether to skip regeneration of the provider dependencies :return: True if build is needed. """ build_needed = False - modified_files, not_modified_files = calculate_md5_checksum_for_files(md5sum_cache_dir, update=False) - if len(modified_files) > 0: + modified_files, not_modified_files = calculate_md5_checksum_for_files( + md5sum_cache_dir, update=False, skip_provider_dependencies_check=skip_provider_dependencies_check + ) + if modified_files: get_console().print( f"[warning]The following important files are modified in {AIRFLOW_SOURCES_ROOT} " f"since last time image was built: [/]\n\n" diff --git a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py index 0c489bad63..c8ea48ec48 100755 --- a/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py +++ b/scripts/ci/pre_commit/pre_commit_update_providers_dependencies.py @@ -209,12 +209,28 @@ if __name__ == "__main__": console.print("[red]Errors found during verification. Exiting!") console.print() sys.exit(1) - DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n") - console.print( - f"[yellow]If you see changes to the {DEPENDENCIES_JSON_FILE_PATH} file - " - f"do not modify the file manually. Let pre-commit do the job!" - ) - console.print() - console.print("[green]Verification complete! Success!\n") - console.print(f"Written {DEPENDENCIES_JSON_FILE_PATH}") - console.print() + old_dependencies = DEPENDENCIES_JSON_FILE_PATH.read_text() + new_dependencies = json.dumps(unique_sorted_dependencies, indent=2) + "\n" + if new_dependencies != old_dependencies: + DEPENDENCIES_JSON_FILE_PATH.write_text(json.dumps(unique_sorted_dependencies, indent=2) + "\n") + if os.environ.get("CI"): + console.print() + console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}") + console.print( + f"[yellow]You will need to run breeze locally and commit " + f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n" + ) + console.print() + else: + console.print() + console.print( + f"[yellow]Regenerated new dependencies. Please commit " + f"{DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)}!\n" + ) + console.print(f"[info]Written {DEPENDENCIES_JSON_FILE_PATH}") + console.print() + else: + console.print( + "[green]No need to regenerate dependencies!\n[/]" + f"The {DEPENDENCIES_JSON_FILE_PATH.relative_to(AIRFLOW_SOURCES_ROOT)} is up to date!\n" + )
