This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2d083cb208 Fix an edge case when parallel spell-checking fails (#36591)
2d083cb208 is described below

commit 2d083cb2082a5b8e150f7b580009da46aa3cff7b
Author: Jarek Potiuk <[email protected]>
AuthorDate: Thu Jan 4 14:11:32 2024 +0100

    Fix an edge case when parallel spell-checking fails (#36591)
    
    * Fix an edge case when parallel spell-checking fails
    
    There was an edge case where parallel spell-check only could have
    failed because of interaction between packages that were built in
    parallel.
    
    Our documentation building is parallelized so that packages can
    be built in parallel, however there is an extra complexity added
    when different packages are built together in parallel and when those
    packages are referring to each other.
    
    The problem is when one package is using reference to another package
    that is built at the same time. In this case the first package will
    fail, because it will miss references to the other package.
    
    This problem has been solved for documentation building, by introducing
    multiple passess. If a package fails during the first wave, it is
    attempted to be rebuilt for the second time (without rebuilding the
    packages that succeeded) - this way at the second pass, there is no
    risk that those two pckages will be built in parallel.
    
    In fact we have up to 3(!) passes - in order to account for situation
    where there are three packages in chain built together. I.e if there
    is A->B->C dependency chain and A and B are built when C is being built
    both A and B will fail in the first pass, but when A and B will be built
    in the second pass, A will also fail (because it will expect B) so we
    have a third pass (and there only A will be rebuilt).
    
    This works fine for docs building, but when we do spellcheck-only, the
    situation is a bit different because we do not perform documentation
    build in the first place - so re-running spellchecking when we have
    some A-> B or A->B-C dependency, because neither B nor C will be rebuilt
    when we run spellcheck-only.
    
    This was attempted to be fixed in #36336 by switching to
    docs+spellcheck build when we detect that spellchecking fails because
    of dependency to another package. However the fix was not good enough -
    what the fix did was to attempt to rebuild all packages and continue
    rebuilding all packages in 2nd and 3rd pass - which had the unfortunate
    side-effect that if the 2nd pass failed as well, the 3rd pass cleaned
    everything and repeated what the 2nd pass did before.
    
    This PR fixes it properly:
    
    * In case first pass with spellcheck-only fails because of package
      dependency, the 2nd pass will rebuild all originally built packages
      both build + spellcheck
    
    * However in this case the 3rd pass will only rebuild packages that
      failed in the 2nd pass (not all originally built packages as it
      was implemented in #36336)
    
    * Also in case we do spellcheck-only, we introduce the 4th(!) pass
      to account for the same case A->B-C that the third pass addresses
      in the "build docs" case
    
    This PR adds typing, reviews return values from the methods implementing
    the logic and adds diagnostic logs and comments explaining the logic in
    an attempt to make it more readable and see what's going on.
    
    * Update docs/build_docs.py
    
    * Update docs/build_docs.py
---
 docs/build_docs.py | 209 +++++++++++++++++++++++++++++------------------------
 1 file changed, 115 insertions(+), 94 deletions(-)

diff --git a/docs/build_docs.py b/docs/build_docs.py
index 6eac628b96..49f1708750 100755
--- a/docs/build_docs.py
+++ b/docs/build_docs.py
@@ -28,7 +28,7 @@ import multiprocessing
 import os
 import sys
 from collections import defaultdict
-from typing import Callable, Iterable, NamedTuple, TypeVar
+from typing import Any, Callable, Iterable, NamedTuple, TypeVar
 
 from rich.console import Console
 from tabulate import tabulate
@@ -219,7 +219,7 @@ def 
perform_spell_check_for_single_package(build_specification: BuildSpecificati
 
 
 def build_docs_for_packages(
-    current_packages: list[str],
+    packages_to_build: list[str],
     docs_only: bool,
     spellcheck_only: bool,
     jobs: int,
@@ -229,28 +229,28 @@ def build_docs_for_packages(
     all_build_errors: dict[str, list[DocBuildError]] = defaultdict(list)
     all_spelling_errors: dict[str, list[SpellingError]] = defaultdict(list)
     with with_group("Cleaning documentation files"):
-        for package_name in current_packages:
+        for package_name in packages_to_build:
             console.print(f"[info]{package_name:60}:[/] Cleaning files")
             builder = AirflowDocsBuilder(package_name=package_name)
             builder.clean_files()
     if jobs > 1:
         run_in_parallel(
-            all_build_errors,
-            all_spelling_errors,
-            current_packages,
-            docs_only,
-            jobs,
-            spellcheck_only,
-            verbose,
+            all_build_errors=all_build_errors,
+            all_spelling_errors=all_spelling_errors,
+            packages_to_build=packages_to_build,
+            docs_only=docs_only,
+            jobs=jobs,
+            spellcheck_only=spellcheck_only,
+            verbose=verbose,
         )
     else:
         run_sequentially(
-            all_build_errors,
-            all_spelling_errors,
-            current_packages,
-            docs_only,
-            spellcheck_only,
-            verbose,
+            all_build_errors=all_build_errors,
+            all_spelling_errors=all_spelling_errors,
+            packages_to_build=packages_to_build,
+            docs_only=docs_only,
+            spellcheck_only=spellcheck_only,
+            verbose=verbose,
         )
     return all_build_errors, all_spelling_errors
 
@@ -258,14 +258,14 @@ def build_docs_for_packages(
 def run_sequentially(
     all_build_errors,
     all_spelling_errors,
-    current_packages,
+    packages_to_build,
     docs_only,
     spellcheck_only,
     verbose,
 ):
     """Run both - spellcheck and docs build sequentially without 
multiprocessing"""
     if not spellcheck_only:
-        for package_name in current_packages:
+        for package_name in packages_to_build:
             build_result = perform_docs_build_for_single_package(
                 build_specification=BuildSpecification(
                     package_name=package_name,
@@ -276,7 +276,7 @@ def run_sequentially(
                 all_build_errors[package_name].extend(build_result.errors)
                 print_build_output(build_result)
     if not docs_only:
-        for package_name in current_packages:
+        for package_name in packages_to_build:
             spellcheck_result = perform_spell_check_for_single_package(
                 build_specification=BuildSpecification(
                     package_name=package_name,
@@ -291,20 +291,20 @@ def run_sequentially(
 
 
 def run_in_parallel(
-    all_build_errors,
-    all_spelling_errors,
-    current_packages,
-    docs_only,
-    jobs,
-    spellcheck_only,
-    verbose,
+    all_build_errors: dict[str, list[DocBuildError]],
+    all_spelling_errors: dict[str, list[SpellingError]],
+    packages_to_build: list[str],
+    docs_only: bool,
+    jobs: int,
+    spellcheck_only: bool,
+    verbose: bool,
 ):
     """Run both - spellcheck and docs build sequentially without 
multiprocessing"""
     with multiprocessing.Pool(processes=jobs) as pool:
         if not spellcheck_only:
             run_docs_build_in_parallel(
                 all_build_errors=all_build_errors,
-                current_packages=current_packages,
+                packages_to_build=packages_to_build,
                 verbose=verbose,
                 pool=pool,
             )
@@ -312,7 +312,7 @@ def run_in_parallel(
             run_spell_check_in_parallel(
                 all_spelling_errors=all_spelling_errors,
                 all_build_errors=all_build_errors,
-                current_packages=current_packages,
+                packages_to_build=packages_to_build,
                 verbose=verbose,
                 pool=pool,
             )
@@ -331,14 +331,14 @@ def print_build_output(result: BuildDocsResult):
 
 def run_docs_build_in_parallel(
     all_build_errors: dict[str, list[DocBuildError]],
-    current_packages: list[str],
+    packages_to_build: list[str],
     verbose: bool,
-    pool,
+    pool: Any,  # Cannot use multiprocessing types here: 
https://github.com/python/typeshed/issues/4266
 ):
     """Runs documentation building in parallel."""
     doc_build_specifications: list[BuildSpecification] = []
     with with_group("Scheduling documentation to build"):
-        for package_name in current_packages:
+        for package_name in packages_to_build:
             console.print(f"[info]{package_name:60}:[/] Scheduling 
documentation to build")
             doc_build_specifications.append(
                 BuildSpecification(
@@ -370,14 +370,14 @@ def print_spelling_output(result: SpellCheckResult):
 def run_spell_check_in_parallel(
     all_spelling_errors: dict[str, list[SpellingError]],
     all_build_errors: dict[str, list[DocBuildError]],
-    current_packages: list[str],
+    packages_to_build: list[str],
     verbose: bool,
     pool,
 ):
     """Runs spell check in parallel."""
     spell_check_specifications: list[BuildSpecification] = []
     with with_group("Scheduling spell checking of documentation"):
-        for package_name in current_packages:
+        for package_name in packages_to_build:
             console.print(f"[info]{package_name:60}:[/] Scheduling 
spellchecking")
             
spell_check_specifications.append(BuildSpecification(package_name=package_name, 
verbose=verbose))
     with with_group("Running spell checking of documentation"):
@@ -444,21 +444,21 @@ def main():
 
     if package_filters:
         console.print("Current package filters: ", package_filters)
-    current_packages = process_package_filters(available_packages, 
package_filters)
+    packages_to_build = process_package_filters(available_packages, 
package_filters)
     with with_group("Fetching inventories"):
         # Inventories that could not be retrieved should be built first. This 
may mean this is a
         # new package.
         packages_without_inventories = fetch_inventories()
     normal_packages, priority_packages = partition(
-        lambda d: d in packages_without_inventories, current_packages
+        lambda d: d in packages_without_inventories, packages_to_build
     )
     normal_packages, priority_packages = list(normal_packages), 
list(priority_packages)
     jobs = args.jobs if args.jobs != 0 else os.cpu_count()
 
     with with_group(
-        f"Documentation will be built for {len(current_packages)} package(s) 
with {jobs} parallel jobs"
+        f"Documentation will be built for {len(packages_to_build)} package(s) 
with {jobs} parallel jobs"
     ):
-        for pkg_no, pkg in enumerate(current_packages, start=1):
+        for pkg_no, pkg in enumerate(packages_to_build, start=1):
             console.print(f"{pkg_no}. {pkg}")
 
     all_build_errors: dict[str | None, list[DocBuildError]] = {}
@@ -466,7 +466,7 @@ def main():
     if priority_packages:
         # Build priority packages
         package_build_errors, package_spelling_errors = 
build_docs_for_packages(
-            current_packages=priority_packages,
+            packages_to_build=priority_packages,
             docs_only=docs_only,
             spellcheck_only=spellcheck_only,
             jobs=jobs,
@@ -482,7 +482,7 @@ def main():
     # two or more inventories, it is better to try to build for all packages 
as the previous packages
     # may have failed as well.
     package_build_errors, package_spelling_errors = build_docs_for_packages(
-        current_packages=current_packages if len(priority_packages) > 1 else 
normal_packages,
+        packages_to_build=packages_to_build if len(priority_packages) > 1 else 
normal_packages,
         docs_only=docs_only,
         spellcheck_only=spellcheck_only,
         jobs=jobs,
@@ -495,31 +495,50 @@ def main():
 
     if not args.one_pass_only:
         # Build documentation for some packages again if it can help them.
-        package_build_errors, package_spelling_errors = 
retry_building_docs_if_needed(
-            all_build_errors,
-            all_spelling_errors,
-            args,
-            docs_only,
-            jobs,
-            package_build_errors,
-            package_spelling_errors,
-            current_packages,
-            spellcheck_only,
+        package_build_errors = retry_building_docs_if_needed(
+            all_build_errors=all_build_errors,
+            all_spelling_errors=all_spelling_errors,
+            args=args,
+            docs_only=docs_only,
+            jobs=jobs,
+            package_build_errors=package_build_errors,
+            originally_built_packages=packages_to_build,
+            # If spellchecking fails, we need to rebuild all packages first in 
case some references
+            # are broken between packages
+            rebuild_all_packages=spellcheck_only,
         )
 
-        # And try again in case one change spans across three-level 
dependencies
-        retry_building_docs_if_needed(
-            all_build_errors,
-            all_spelling_errors,
-            args,
-            docs_only,
-            jobs,
-            package_build_errors,
-            package_spelling_errors,
-            current_packages,
-            spellcheck_only,
+        # And try again in case one change spans across three-level 
dependencies.
+        package_build_errors = retry_building_docs_if_needed(
+            all_build_errors=all_build_errors,
+            all_spelling_errors=all_spelling_errors,
+            args=args,
+            docs_only=docs_only,
+            jobs=jobs,
+            package_build_errors=package_build_errors,
+            originally_built_packages=packages_to_build,
+            # In the 3rd pass we only rebuild packages that failed in the 2nd 
pass
+            # no matter if we do spellcheck-only build
+            rebuild_all_packages=False,
         )
 
+        if spellcheck_only:
+            # And in case of spellcheck-only, we add a 4th pass to account for 
A->B-C case
+            # For spellcheck-only build, the first pass does not solve any of 
the dependency
+            # Issues, they only start getting solved and the 2nd pass so we 
might need to do one more pass
+            package_build_errors = retry_building_docs_if_needed(
+                all_build_errors=all_build_errors,
+                all_spelling_errors=all_spelling_errors,
+                args=args,
+                docs_only=docs_only,
+                jobs=jobs,
+                package_build_errors=package_build_errors,
+                originally_built_packages=packages_to_build,
+                # In the 4th pass we only rebuild packages that failed in the 
3rd pass
+                # no matter if we do spellcheck-only build
+                rebuild_all_packages=False,
+            )
+
     if not disable_checks:
         general_errors = 
lint_checks.run_all_check(disable_provider_checks=disable_provider_checks)
         if general_errors:
@@ -540,45 +559,47 @@ def main():
 
 
 def retry_building_docs_if_needed(
-    all_build_errors,
-    all_spelling_errors,
-    args,
-    docs_only,
-    jobs,
-    package_build_errors,
-    package_spelling_errors,
-    current_packages,
-    spellcheck_only,
-):
+    all_build_errors: dict[str, list[DocBuildError]],
+    all_spelling_errors: dict[str, list[SpellingError]],
+    args: argparse.Namespace,
+    docs_only: bool,
+    jobs: int,
+    package_build_errors: dict[str, list[DocBuildError]],
+    originally_built_packages: list[str],
+    rebuild_all_packages: bool,
+) -> dict[str, list[DocBuildError]]:
     to_retry_packages = [
         package_name
         for package_name, errors in package_build_errors.items()
         if any(any((m in e.message) for m in ERRORS_ELIGIBLE_TO_REBUILD) for e 
in errors)
     ]
-    if to_retry_packages and spellcheck_only:
-        # in case spellchecking fails with retry all packages should be 
rebuilt without spell-checking
-        # in case the failed package refers to another package
-        to_retry_packages = current_packages
-    if to_retry_packages:
-        for package_name in to_retry_packages:
-            if package_name in all_build_errors:
-                del all_build_errors[package_name]
-            if package_name in all_spelling_errors:
-                del all_spelling_errors[package_name]
-
-        package_build_errors, package_spelling_errors = 
build_docs_for_packages(
-            current_packages=to_retry_packages,
-            docs_only=docs_only,
-            spellcheck_only=False,
-            jobs=jobs,
-            verbose=args.verbose,
-        )
-        if package_build_errors:
-            all_build_errors.update(package_build_errors)
-        if package_spelling_errors:
-            all_spelling_errors.update(package_spelling_errors)
-        return package_build_errors, package_spelling_errors
-    return package_build_errors, package_spelling_errors
+    if not to_retry_packages:
+        console.print("[green]No packages to retry. No more passes are 
needed.[/]")
+        return package_build_errors
+    console.print("[warning] Some packages failed to build due to 
dependencies. We need another pass.[/]")
+    # if we are rebuilding all packages, we need to retry all packages
+    # even if there is one package to rebuild only
+    if rebuild_all_packages:
+        console.print("[warning]Rebuilding all originally built package as 
this is the first build pass:[/]")
+        to_retry_packages = originally_built_packages
+    console.print(f"[bright_blue]Packages to rebuild: {to_retry_packages}[/]")
+    for package_name in to_retry_packages:
+        if package_name in all_build_errors:
+            del all_build_errors[package_name]
+        if package_name in all_spelling_errors:
+            del all_spelling_errors[package_name]
+    package_build_errors, package_spelling_errors = build_docs_for_packages(
+        packages_to_build=to_retry_packages,
+        docs_only=docs_only,
+        spellcheck_only=False,
+        jobs=jobs,
+        verbose=args.verbose,
+    )
+    if package_build_errors:
+        all_build_errors.update(package_build_errors)
+    if package_spelling_errors:
+        all_spelling_errors.update(package_spelling_errors)
+    return package_build_errors
 
 
 if __name__ == "__main__":

Reply via email to