This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 2d083cb208 Fix an edge case when parallel spell-checking fails (#36591)
2d083cb208 is described below
commit 2d083cb2082a5b8e150f7b580009da46aa3cff7b
Author: Jarek Potiuk <[email protected]>
AuthorDate: Thu Jan 4 14:11:32 2024 +0100
Fix an edge case when parallel spell-checking fails (#36591)
* Fix an edge case when parallel spell-checking fails
There was an edge case where parallel spell-check only could have
failed because of interaction between packages that were built in
parallel.
Our documentation building is parallelized so that packages can
be built in parallel, however there is an extra complexity added
when different packages are built together in parallel and when those
packages are referring to each other.
The problem is when one package is using reference to another package
that is built at the same time. In this case the first package will
fail, because it will miss references to the other package.
This problem has been solved for documentation building, by introducing
multiple passess. If a package fails during the first wave, it is
attempted to be rebuilt for the second time (without rebuilding the
packages that succeeded) - this way at the second pass, there is no
risk that those two pckages will be built in parallel.
In fact we have up to 3(!) passes - in order to account for situation
where there are three packages in chain built together. I.e if there
is A->B->C dependency chain and A and B are built when C is being built
both A and B will fail in the first pass, but when A and B will be built
in the second pass, A will also fail (because it will expect B) so we
have a third pass (and there only A will be rebuilt).
This works fine for docs building, but when we do spellcheck-only, the
situation is a bit different because we do not perform documentation
build in the first place - so re-running spellchecking when we have
some A-> B or A->B-C dependency, because neither B nor C will be rebuilt
when we run spellcheck-only.
This was attempted to be fixed in #36336 by switching to
docs+spellcheck build when we detect that spellchecking fails because
of dependency to another package. However the fix was not good enough -
what the fix did was to attempt to rebuild all packages and continue
rebuilding all packages in 2nd and 3rd pass - which had the unfortunate
side-effect that if the 2nd pass failed as well, the 3rd pass cleaned
everything and repeated what the 2nd pass did before.
This PR fixes it properly:
* In case first pass with spellcheck-only fails because of package
dependency, the 2nd pass will rebuild all originally built packages
both build + spellcheck
* However in this case the 3rd pass will only rebuild packages that
failed in the 2nd pass (not all originally built packages as it
was implemented in #36336)
* Also in case we do spellcheck-only, we introduce the 4th(!) pass
to account for the same case A->B-C that the third pass addresses
in the "build docs" case
This PR adds typing, reviews return values from the methods implementing
the logic and adds diagnostic logs and comments explaining the logic in
an attempt to make it more readable and see what's going on.
* Update docs/build_docs.py
* Update docs/build_docs.py
---
docs/build_docs.py | 209 +++++++++++++++++++++++++++++------------------------
1 file changed, 115 insertions(+), 94 deletions(-)
diff --git a/docs/build_docs.py b/docs/build_docs.py
index 6eac628b96..49f1708750 100755
--- a/docs/build_docs.py
+++ b/docs/build_docs.py
@@ -28,7 +28,7 @@ import multiprocessing
import os
import sys
from collections import defaultdict
-from typing import Callable, Iterable, NamedTuple, TypeVar
+from typing import Any, Callable, Iterable, NamedTuple, TypeVar
from rich.console import Console
from tabulate import tabulate
@@ -219,7 +219,7 @@ def
perform_spell_check_for_single_package(build_specification: BuildSpecificati
def build_docs_for_packages(
- current_packages: list[str],
+ packages_to_build: list[str],
docs_only: bool,
spellcheck_only: bool,
jobs: int,
@@ -229,28 +229,28 @@ def build_docs_for_packages(
all_build_errors: dict[str, list[DocBuildError]] = defaultdict(list)
all_spelling_errors: dict[str, list[SpellingError]] = defaultdict(list)
with with_group("Cleaning documentation files"):
- for package_name in current_packages:
+ for package_name in packages_to_build:
console.print(f"[info]{package_name:60}:[/] Cleaning files")
builder = AirflowDocsBuilder(package_name=package_name)
builder.clean_files()
if jobs > 1:
run_in_parallel(
- all_build_errors,
- all_spelling_errors,
- current_packages,
- docs_only,
- jobs,
- spellcheck_only,
- verbose,
+ all_build_errors=all_build_errors,
+ all_spelling_errors=all_spelling_errors,
+ packages_to_build=packages_to_build,
+ docs_only=docs_only,
+ jobs=jobs,
+ spellcheck_only=spellcheck_only,
+ verbose=verbose,
)
else:
run_sequentially(
- all_build_errors,
- all_spelling_errors,
- current_packages,
- docs_only,
- spellcheck_only,
- verbose,
+ all_build_errors=all_build_errors,
+ all_spelling_errors=all_spelling_errors,
+ packages_to_build=packages_to_build,
+ docs_only=docs_only,
+ spellcheck_only=spellcheck_only,
+ verbose=verbose,
)
return all_build_errors, all_spelling_errors
@@ -258,14 +258,14 @@ def build_docs_for_packages(
def run_sequentially(
all_build_errors,
all_spelling_errors,
- current_packages,
+ packages_to_build,
docs_only,
spellcheck_only,
verbose,
):
"""Run both - spellcheck and docs build sequentially without
multiprocessing"""
if not spellcheck_only:
- for package_name in current_packages:
+ for package_name in packages_to_build:
build_result = perform_docs_build_for_single_package(
build_specification=BuildSpecification(
package_name=package_name,
@@ -276,7 +276,7 @@ def run_sequentially(
all_build_errors[package_name].extend(build_result.errors)
print_build_output(build_result)
if not docs_only:
- for package_name in current_packages:
+ for package_name in packages_to_build:
spellcheck_result = perform_spell_check_for_single_package(
build_specification=BuildSpecification(
package_name=package_name,
@@ -291,20 +291,20 @@ def run_sequentially(
def run_in_parallel(
- all_build_errors,
- all_spelling_errors,
- current_packages,
- docs_only,
- jobs,
- spellcheck_only,
- verbose,
+ all_build_errors: dict[str, list[DocBuildError]],
+ all_spelling_errors: dict[str, list[SpellingError]],
+ packages_to_build: list[str],
+ docs_only: bool,
+ jobs: int,
+ spellcheck_only: bool,
+ verbose: bool,
):
"""Run both - spellcheck and docs build sequentially without
multiprocessing"""
with multiprocessing.Pool(processes=jobs) as pool:
if not spellcheck_only:
run_docs_build_in_parallel(
all_build_errors=all_build_errors,
- current_packages=current_packages,
+ packages_to_build=packages_to_build,
verbose=verbose,
pool=pool,
)
@@ -312,7 +312,7 @@ def run_in_parallel(
run_spell_check_in_parallel(
all_spelling_errors=all_spelling_errors,
all_build_errors=all_build_errors,
- current_packages=current_packages,
+ packages_to_build=packages_to_build,
verbose=verbose,
pool=pool,
)
@@ -331,14 +331,14 @@ def print_build_output(result: BuildDocsResult):
def run_docs_build_in_parallel(
all_build_errors: dict[str, list[DocBuildError]],
- current_packages: list[str],
+ packages_to_build: list[str],
verbose: bool,
- pool,
+ pool: Any, # Cannot use multiprocessing types here:
https://github.com/python/typeshed/issues/4266
):
"""Runs documentation building in parallel."""
doc_build_specifications: list[BuildSpecification] = []
with with_group("Scheduling documentation to build"):
- for package_name in current_packages:
+ for package_name in packages_to_build:
console.print(f"[info]{package_name:60}:[/] Scheduling
documentation to build")
doc_build_specifications.append(
BuildSpecification(
@@ -370,14 +370,14 @@ def print_spelling_output(result: SpellCheckResult):
def run_spell_check_in_parallel(
all_spelling_errors: dict[str, list[SpellingError]],
all_build_errors: dict[str, list[DocBuildError]],
- current_packages: list[str],
+ packages_to_build: list[str],
verbose: bool,
pool,
):
"""Runs spell check in parallel."""
spell_check_specifications: list[BuildSpecification] = []
with with_group("Scheduling spell checking of documentation"):
- for package_name in current_packages:
+ for package_name in packages_to_build:
console.print(f"[info]{package_name:60}:[/] Scheduling
spellchecking")
spell_check_specifications.append(BuildSpecification(package_name=package_name,
verbose=verbose))
with with_group("Running spell checking of documentation"):
@@ -444,21 +444,21 @@ def main():
if package_filters:
console.print("Current package filters: ", package_filters)
- current_packages = process_package_filters(available_packages,
package_filters)
+ packages_to_build = process_package_filters(available_packages,
package_filters)
with with_group("Fetching inventories"):
# Inventories that could not be retrieved should be built first. This
may mean this is a
# new package.
packages_without_inventories = fetch_inventories()
normal_packages, priority_packages = partition(
- lambda d: d in packages_without_inventories, current_packages
+ lambda d: d in packages_without_inventories, packages_to_build
)
normal_packages, priority_packages = list(normal_packages),
list(priority_packages)
jobs = args.jobs if args.jobs != 0 else os.cpu_count()
with with_group(
- f"Documentation will be built for {len(current_packages)} package(s)
with {jobs} parallel jobs"
+ f"Documentation will be built for {len(packages_to_build)} package(s)
with {jobs} parallel jobs"
):
- for pkg_no, pkg in enumerate(current_packages, start=1):
+ for pkg_no, pkg in enumerate(packages_to_build, start=1):
console.print(f"{pkg_no}. {pkg}")
all_build_errors: dict[str | None, list[DocBuildError]] = {}
@@ -466,7 +466,7 @@ def main():
if priority_packages:
# Build priority packages
package_build_errors, package_spelling_errors =
build_docs_for_packages(
- current_packages=priority_packages,
+ packages_to_build=priority_packages,
docs_only=docs_only,
spellcheck_only=spellcheck_only,
jobs=jobs,
@@ -482,7 +482,7 @@ def main():
# two or more inventories, it is better to try to build for all packages
as the previous packages
# may have failed as well.
package_build_errors, package_spelling_errors = build_docs_for_packages(
- current_packages=current_packages if len(priority_packages) > 1 else
normal_packages,
+ packages_to_build=packages_to_build if len(priority_packages) > 1 else
normal_packages,
docs_only=docs_only,
spellcheck_only=spellcheck_only,
jobs=jobs,
@@ -495,31 +495,50 @@ def main():
if not args.one_pass_only:
# Build documentation for some packages again if it can help them.
- package_build_errors, package_spelling_errors =
retry_building_docs_if_needed(
- all_build_errors,
- all_spelling_errors,
- args,
- docs_only,
- jobs,
- package_build_errors,
- package_spelling_errors,
- current_packages,
- spellcheck_only,
+ package_build_errors = retry_building_docs_if_needed(
+ all_build_errors=all_build_errors,
+ all_spelling_errors=all_spelling_errors,
+ args=args,
+ docs_only=docs_only,
+ jobs=jobs,
+ package_build_errors=package_build_errors,
+ originally_built_packages=packages_to_build,
+ # If spellchecking fails, we need to rebuild all packages first in
case some references
+ # are broken between packages
+ rebuild_all_packages=spellcheck_only,
)
- # And try again in case one change spans across three-level
dependencies
- retry_building_docs_if_needed(
- all_build_errors,
- all_spelling_errors,
- args,
- docs_only,
- jobs,
- package_build_errors,
- package_spelling_errors,
- current_packages,
- spellcheck_only,
+ # And try again in case one change spans across three-level
dependencies.
+ package_build_errors = retry_building_docs_if_needed(
+ all_build_errors=all_build_errors,
+ all_spelling_errors=all_spelling_errors,
+ args=args,
+ docs_only=docs_only,
+ jobs=jobs,
+ package_build_errors=package_build_errors,
+ originally_built_packages=packages_to_build,
+ # In the 3rd pass we only rebuild packages that failed in the 2nd
pass
+ # no matter if we do spellcheck-only build
+ rebuild_all_packages=False,
)
+ if spellcheck_only:
+ # And in case of spellcheck-only, we add a 4th pass to account for
A->B-C case
+ # For spellcheck-only build, the first pass does not solve any of
the dependency
+ # Issues, they only start getting solved and the 2nd pass so we
might need to do one more pass
+ package_build_errors = retry_building_docs_if_needed(
+ all_build_errors=all_build_errors,
+ all_spelling_errors=all_spelling_errors,
+ args=args,
+ docs_only=docs_only,
+ jobs=jobs,
+ package_build_errors=package_build_errors,
+ originally_built_packages=packages_to_build,
+ # In the 4th pass we only rebuild packages that failed in the
3rd pass
+ # no matter if we do spellcheck-only build
+ rebuild_all_packages=False,
+ )
+
if not disable_checks:
general_errors =
lint_checks.run_all_check(disable_provider_checks=disable_provider_checks)
if general_errors:
@@ -540,45 +559,47 @@ def main():
def retry_building_docs_if_needed(
- all_build_errors,
- all_spelling_errors,
- args,
- docs_only,
- jobs,
- package_build_errors,
- package_spelling_errors,
- current_packages,
- spellcheck_only,
-):
+ all_build_errors: dict[str, list[DocBuildError]],
+ all_spelling_errors: dict[str, list[SpellingError]],
+ args: argparse.Namespace,
+ docs_only: bool,
+ jobs: int,
+ package_build_errors: dict[str, list[DocBuildError]],
+ originally_built_packages: list[str],
+ rebuild_all_packages: bool,
+) -> dict[str, list[DocBuildError]]:
to_retry_packages = [
package_name
for package_name, errors in package_build_errors.items()
if any(any((m in e.message) for m in ERRORS_ELIGIBLE_TO_REBUILD) for e
in errors)
]
- if to_retry_packages and spellcheck_only:
- # in case spellchecking fails with retry all packages should be
rebuilt without spell-checking
- # in case the failed package refers to another package
- to_retry_packages = current_packages
- if to_retry_packages:
- for package_name in to_retry_packages:
- if package_name in all_build_errors:
- del all_build_errors[package_name]
- if package_name in all_spelling_errors:
- del all_spelling_errors[package_name]
-
- package_build_errors, package_spelling_errors =
build_docs_for_packages(
- current_packages=to_retry_packages,
- docs_only=docs_only,
- spellcheck_only=False,
- jobs=jobs,
- verbose=args.verbose,
- )
- if package_build_errors:
- all_build_errors.update(package_build_errors)
- if package_spelling_errors:
- all_spelling_errors.update(package_spelling_errors)
- return package_build_errors, package_spelling_errors
- return package_build_errors, package_spelling_errors
+ if not to_retry_packages:
+ console.print("[green]No packages to retry. No more passes are
needed.[/]")
+ return package_build_errors
+ console.print("[warning] Some packages failed to build due to
dependencies. We need another pass.[/]")
+ # if we are rebuilding all packages, we need to retry all packages
+ # even if there is one package to rebuild only
+ if rebuild_all_packages:
+ console.print("[warning]Rebuilding all originally built package as
this is the first build pass:[/]")
+ to_retry_packages = originally_built_packages
+ console.print(f"[bright_blue]Packages to rebuild: {to_retry_packages}[/]")
+ for package_name in to_retry_packages:
+ if package_name in all_build_errors:
+ del all_build_errors[package_name]
+ if package_name in all_spelling_errors:
+ del all_spelling_errors[package_name]
+ package_build_errors, package_spelling_errors = build_docs_for_packages(
+ packages_to_build=to_retry_packages,
+ docs_only=docs_only,
+ spellcheck_only=False,
+ jobs=jobs,
+ verbose=args.verbose,
+ )
+ if package_build_errors:
+ all_build_errors.update(package_build_errors)
+ if package_spelling_errors:
+ all_spelling_errors.update(package_spelling_errors)
+ return package_build_errors
if __name__ == "__main__":