This is an automated email from the ASF dual-hosted git repository.
bugraoz93 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 4325707b488 Add per-provider pre-extras install hook for native build
prereqs (#67205)
4325707b488 is described below
commit 4325707b488b88c5f88d20818a9e329fbd6d73af
Author: Bugra Ozturk <[email protected]>
AuthorDate: Thu May 21 20:15:01 2026 +0200
Add per-provider pre-extras install hook for native build prereqs (#67205)
* Add per-provider pre-extras install hook for native build prereqs
* Move logic to provider root
---
Dockerfile.ci | 29 ++++
contributing-docs/12_provider_distributions.rst | 72 ++++++++--
scripts/docker/entrypoint_ci.sh | 38 +++++
scripts/in_container/run_pre_extras_install.py | 184 ++++++++++++++++++++++++
4 files changed, 312 insertions(+), 11 deletions(-)
diff --git a/Dockerfile.ci b/Dockerfile.ci
index e7313ff1dfb..6da372e9a44 100644
--- a/Dockerfile.ci
+++ b/Dockerfile.ci
@@ -1478,6 +1478,34 @@ function reinstall_shared_distributions() {
uv pip install --no-deps $(ls -d /opt/airflow/shared/*/)
}
+PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL=()
+
+function run_pre_extras_install_if_registered() {
+    # Run the pre-extras install manifest for the provider given as $1, but only when that
+    # provider is listed in PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL; otherwise do nothing.
+    local provider_id="${1}"
+    local registered_provider
+    for registered_provider in "${PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL[@]}"; do
+        if [[ "${registered_provider}" == "${provider_id}" ]]; then
+            echo
+            echo "${COLOR_BLUE}Running pre-extras install manifest for ${provider_id}${COLOR_RESET}"
+            echo
+            # Scratch file that receives the `export` lines written via --emit-env-to.
+            local env_file
+            env_file=$(mktemp)
+            if ! python "${AIRFLOW_SOURCES}/scripts/in_container/run_pre_extras_install.py" \
+                "${provider_id}" --emit-env-to "${env_file}"; then
+                rm -f "${env_file}"
+                echo "${COLOR_RED}Pre-extras install failed for ${provider_id}${COLOR_RESET}"
+                exit 1
+            fi
+            # -s: source the file only when it is non-empty.
+            if [[ -s "${env_file}" ]]; then
+                # shellcheck disable=SC1090
+                source "${env_file}"
+            fi
+            rm -f "${env_file}"
+            return
+        fi
+    done
+}
+
function check_force_lowest_dependencies() {
if [[ ${FORCE_LOWEST_DEPENDENCIES=} != "true" ]]; then
return
@@ -1496,6 +1524,7 @@ function check_force_lowest_dependencies() {
exit 0
fi
cd "${AIRFLOW_SOURCES}/providers/${provider_id/.//}" || exit 1
+ run_pre_extras_install_if_registered "${provider_id}"
# --no-binary is needed in order to avoid libxml and xmlsec using
different version of libxml2
# (binary lxml embeds its own libxml2, while xmlsec uses system one).
# See https://bugs.launchpad.net/lxml/+bug/2110068
diff --git a/contributing-docs/12_provider_distributions.rst
b/contributing-docs/12_provider_distributions.rst
index 49a987cba21..8ae79f1074b 100644
--- a/contributing-docs/12_provider_distributions.rst
+++ b/contributing-docs/12_provider_distributions.rst
@@ -120,13 +120,11 @@ image for the new dependencies to be used in the Breeze
CI environment.
Non-default provider extras
---------------------------
-Some providers depend on packages that cannot be installed in CI by default —
for example a
-proprietary client library (IBM MQ's ``ibmmq``) or a native library that
requires system packages
-(Google's ``leveldb``/``plyvel`` needs ``libleveldb-dev``). Pulling these into
the default
-``uv sync`` would break CI on every runner that doesn't have the prerequisite
installed.
+Some providers depend on packages that cannot be installed in CI by default.
There are two
+distinct shapes of this problem, and they need different solutions:
-For these cases, declare the dependency as an extra on the provider and
register it as its own
-group at the root, without adding it to ``dev``:
+**Shape A: the Python package builds fine but needs a system library available
via apt**
+(e.g. Google's ``leveldb``/``plyvel`` needs ``libleveldb-dev``). Use the
``ci-image`` group:
1. **In the provider's ``pyproject.toml``** — keep the package under
``[project.optional-dependencies]``
so users can opt in with ``pip install
apache-airflow-providers-<id>[<extra>]``.
@@ -152,13 +150,65 @@ group at the root, without adding it to ``dev``:
{include-group = "my-non-default-extra"},
]
-3. **If system libraries are required**, add them to
``scripts/docker/install_os_dependencies.sh``
- so the CI image has the prerequisites before ``uv sync`` runs.
+3. **Add the apt prerequisite** to
``scripts/docker/install_os_dependencies.sh`` so the CI image
+ has the system library before ``uv sync`` runs.
Because the new group is *not* part of ``dev``, a plain ``uv sync`` on a
contributor's machine
-will not try to install it. The CI image installs it via ``ci-image``;
provider unit tests that
-import the proprietary or hard-to-build module should mock it (see
``providers/google/leveldb``
-for the established pattern).
+will not try to install it. The CI image installs it via ``ci-image``.
+
+**Shape B: the Python package's wheel build needs a proprietary SDK that
cannot live in the
+base CI image** (e.g. IBM MQ's ``ibmmq`` needs the IBM MQ Redistributable
Client headers and
+``MQ_FILE_PATH`` set at build time). Bundling such SDKs in the base image has
licence,
+maintenance, and image-size costs that the project does not want to pay on
every CI run.
+The lowest-direct-dependency provider tests still call ``uv sync
--all-extras`` inside the
+provider directory, so simply mocking the package in tests is not enough — the
sync itself
+must succeed first. Use the per-provider pre-extras-install manifest:
+
+1. **Keep the package under ``[project.optional-dependencies]``** on the
provider so end users
+ still install it with ``pip install
apache-airflow-providers-<id>[<extra>]``. **Do not add it
+ to the ``ci-image`` group** — the base image should not carry the SDK.
+
+2. **Register the provider** in ``PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL`` in
+ ``scripts/docker/entrypoint_ci.sh``. The hook runs immediately before
+ ``check_force_lowest_dependencies`` calls ``uv sync --all-extras`` for that
provider. The
+ list is explicit on purpose so maintainers see when a new provider takes on
this shape of
+ cost, and so the surface for "drive-by privileged code" stays small.
+
+3. **Ship a declarative manifest** at
``providers/<id>/pre_extras_install.yaml``. The
+ manifest is *data, not code*: it is interpreted by
+ ``scripts/in_container/run_pre_extras_install.py``, which restricts what
providers can do
+ to pinned-checksum HTTPS downloads, archive extraction under ``/opt`` or
``/tmp``, and
+ env-var exports. Maintainers reviewing the manifest only need to verify
that the URL and
+ ``sha256`` belong to a trusted upstream — the provider cannot run arbitrary
shell, pipe
+ ``curl`` into ``bash``, exfiltrate secrets, or write outside the
allowlisted prefixes:
+
+ .. code:: yaml
+
+ # providers/ibm/mq/pre_extras_install.yaml
+ downloads:
+ - url:
https://public.dhe.ibm.com/.../9.4.0.0-IBM-MQC-Redist-LinuxX64.tar.gz
+ sha256: <64 lowercase hex chars>
+ extract_to: /opt/mqm
+ env:
+ MQ_FILE_PATH: /opt/mqm
+
+ Schema (both top-level keys are optional; every field of a ``downloads`` entry is required):
+
+ - ``downloads`` (optional list): each entry has ``url`` (must start with
``https://``),
+ ``sha256`` (64 lowercase hex chars), and ``extract_to`` (must start with
``/opt/`` or
+ ``/tmp/`` and may not contain ``..``). Supported archive formats are
``.tar``,
+ ``.tar.gz``/``.tgz`` and ``.zip``; the extractor refuses any entry whose
resolved path
+ escapes ``extract_to``.
+ - ``env`` (optional mapping): each name must match ``^[A-Z][A-Z0-9_]*$``, each value must
+ be a string. The interpreter writes one shell-quoted ``export NAME=value`` line per entry; the
+ entrypoint hook sources those lines into the shell that subsequently runs ``uv sync --all-extras``.
+
+ Any unknown top-level or per-entry key is a hard error.
+
+4. **Mock the module in unit tests** by injecting ``sys.modules["<module>"] =
MagicMock()`` in
+ the provider's ``tests/conftest.py`` before the provider's hooks/operators
get imported.
+ Regular CI does not install the real package, so the mock is what makes
import-time
+ ``from <module> import …`` succeed during normal test runs.
Provider's cross-dependencies
-----------------------------
diff --git a/scripts/docker/entrypoint_ci.sh b/scripts/docker/entrypoint_ci.sh
index 03f761f8584..aa5edbd9095 100755
--- a/scripts/docker/entrypoint_ci.sh
+++ b/scripts/docker/entrypoint_ci.sh
@@ -406,6 +406,43 @@ function reinstall_shared_distributions() {
uv pip install --no-deps $(ls -d /opt/airflow/shared/*/)
}
+# Providers whose `uv sync --all-extras` (run below by check_force_lowest_dependencies)
+# needs additional native build prerequisites installed at test time rather than baked
+# into the base CI image. Each listed provider must ship a declarative manifest at
+# providers/<id>/pre_extras_install.yaml; the manifest is interpreted by
+# scripts/in_container/run_pre_extras_install.py, which restricts allowed operations
+# to pinned-checksum downloads, archive extraction under /opt or /tmp, and env-var
+# export. Providers cannot run arbitrary code through this hook. Maintainers should
+# review every addition to this list as a privileged change. See
+# contributing-docs/12_provider_distributions.rst.
+PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL=()
+
+function run_pre_extras_install_if_registered() {
+    # Run the pre-extras install manifest for the provider given as $1, but only when that
+    # provider is listed in PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL; otherwise do nothing.
+    local provider_id="${1}"
+    local registered_provider
+    for registered_provider in "${PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL[@]}"; do
+        if [[ "${registered_provider}" == "${provider_id}" ]]; then
+            echo
+            echo "${COLOR_BLUE}Running pre-extras install manifest for ${provider_id}${COLOR_RESET}"
+            echo
+            # Scratch file that receives the `export` lines written via --emit-env-to.
+            local env_file
+            env_file=$(mktemp)
+            if ! python "${AIRFLOW_SOURCES}/scripts/in_container/run_pre_extras_install.py" \
+                "${provider_id}" --emit-env-to "${env_file}"; then
+                rm -f "${env_file}"
+                echo "${COLOR_RED}Pre-extras install failed for ${provider_id}${COLOR_RESET}"
+                exit 1
+            fi
+            # -s: source the file only when it is non-empty.
+            if [[ -s "${env_file}" ]]; then
+                # shellcheck disable=SC1090
+                source "${env_file}"
+            fi
+            rm -f "${env_file}"
+            return
+        fi
+    done
+}
+
function check_force_lowest_dependencies() {
if [[ ${FORCE_LOWEST_DEPENDENCIES=} != "true" ]]; then
return
@@ -424,6 +461,7 @@ function check_force_lowest_dependencies() {
exit 0
fi
cd "${AIRFLOW_SOURCES}/providers/${provider_id/.//}" || exit 1
+ run_pre_extras_install_if_registered "${provider_id}"
# --no-binary is needed in order to avoid libxml and xmlsec using
different version of libxml2
# (binary lxml embeds its own libxml2, while xmlsec uses system one).
# See https://bugs.launchpad.net/lxml/+bug/2110068
diff --git a/scripts/in_container/run_pre_extras_install.py
b/scripts/in_container/run_pre_extras_install.py
new file mode 100644
index 00000000000..3bb6ad28aa5
--- /dev/null
+++ b/scripts/in_container/run_pre_extras_install.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Interpret a provider's pre_extras_install.yaml manifest.
+
+This is the only code that runs when a provider is registered in
+PROVIDERS_NEEDING_PRE_EXTRAS_INSTALL inside scripts/docker/entrypoint_ci.sh.
+The manifest is data, not code: providers can declare pinned-checksum
+downloads, archive extractions under /opt or /tmp, and env-var exports, but
+nothing else. Maintainers reviewing a provider's manifest only need to verify
+that the URL and sha256 belong to a trusted upstream.
+
+See contributing-docs/12_provider_distributions.rst for the format.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import re
+import shlex
+import sys
+import tarfile
+import tempfile
+import urllib.request
+import zipfile
+from pathlib import Path
+from typing import NoReturn
+
+import yaml
+
+# Directory under which each provider's pre_extras_install.yaml is looked up.
+PROVIDERS_ROOT = Path("/opt/airflow/providers")
+# An extract_to value must start with one of these prefixes.
+ALLOWED_EXTRACT_PREFIXES = ("/opt/", "/tmp/")
+# Env-var names: one uppercase letter followed by uppercase letters, digits or underscores.
+ENV_NAME_RE = re.compile(r"^[A-Z][A-Z0-9_]*$")
+# A sha256 digest written as 64 lowercase hexadecimal characters.
+SHA256_RE = re.compile(r"^[0-9a-f]{64}$")
+# The only keys accepted at the top level of a manifest ...
+ALLOWED_TOP_LEVEL_KEYS = {"downloads", "env"}
+# ... and inside each entry of its "downloads" list.
+ALLOWED_DOWNLOAD_KEYS = {"url", "sha256", "extract_to"}
+
+
+def fail(msg: str) -> NoReturn:
+    """Write *msg* to stderr with an ``ERROR:`` prefix and exit with status 1."""
+    sys.stderr.write(f"ERROR: {msg}\n")
+    raise SystemExit(1)
+
+
+def validate_manifest(manifest: object, provider_id: str) -> dict:
+    """Check that *manifest* uses only the allowed declarative schema and return it.
+
+    The top-level mapping may contain ``downloads`` (a list of mappings, each carrying
+    exactly ``url``, ``sha256`` and ``extract_to``) and ``env`` (a mapping of env-var
+    names to string values). Every violation is reported through ``fail()``, which
+    exits the process with status 1.
+
+    :param manifest: object loaded from the provider's ``pre_extras_install.yaml``
+    :param provider_id: provider id, used only in error messages
+    :return: the same ``manifest`` object once it passed validation
+    """
+    if not isinstance(manifest, dict):
+        fail(f"manifest for {provider_id} must be a mapping")
+    extra = set(manifest) - ALLOWED_TOP_LEVEL_KEYS
+    if extra:
+        # key=str: YAML keys may be of mixed types, which plain sorted() cannot order.
+        fail(f"manifest for {provider_id} has unknown top-level keys: {sorted(extra, key=str)}")
+    downloads = manifest.get("downloads", [])
+    if not isinstance(downloads, list):
+        fail("'downloads' must be a list")
+    for i, entry in enumerate(downloads):
+        if not isinstance(entry, dict):
+            fail(f"downloads[{i}] must be a mapping")
+        unknown = set(entry) - ALLOWED_DOWNLOAD_KEYS
+        if unknown:
+            fail(f"downloads[{i}] has unknown keys: {sorted(unknown, key=str)}")
+        missing = ALLOWED_DOWNLOAD_KEYS - set(entry)
+        if missing:
+            fail(f"downloads[{i}] is missing required keys: {sorted(missing)}")
+        url = entry["url"]
+        if not isinstance(url, str) or not url.startswith("https://"):
+            fail(f"downloads[{i}].url must be an https:// string (got {url!r})")
+        sha256 = entry["sha256"]
+        # fullmatch, not match: `$` would also accept a value with a trailing newline.
+        if not isinstance(sha256, str) or not SHA256_RE.fullmatch(sha256):
+            fail(f"downloads[{i}].sha256 must be 64 lowercase hex chars")
+        extract_to = entry["extract_to"]
+        if not isinstance(extract_to, str) or not extract_to.startswith(ALLOWED_EXTRACT_PREFIXES):
+            fail(
+                f"downloads[{i}].extract_to must start with one of {ALLOWED_EXTRACT_PREFIXES} "
+                f"(got {extract_to!r})"
+            )
+        # A literal ".." component could climb back out of the allowed prefix.
+        if ".." in Path(extract_to).parts:
+            fail(f"downloads[{i}].extract_to cannot contain '..'")
+    env = manifest.get("env", {})
+    if not isinstance(env, dict):
+        fail("'env' must be a mapping")
+    for name, value in env.items():
+        # fullmatch for the same reason as above: the name ends up verbatim in a sourced file.
+        if not isinstance(name, str) or not ENV_NAME_RE.fullmatch(name):
+            fail(f"env name {name!r} must match {ENV_NAME_RE.pattern}")
+        if not isinstance(value, str):
+            fail(f"env value for {name} must be a string (got {type(value).__name__})")
+    return manifest
+
+
+def download_with_checksum(url: str, expected_sha256: str, dest: Path, timeout: float = 60.0) -> None:
+    """Download *url* into *dest* and verify the sha256 of the bytes that were written.
+
+    The body is streamed in 64 KiB chunks, so the archive is never held in memory as a
+    whole. On a checksum mismatch the downloaded file is removed and ``fail()`` exits the
+    process with status 1.
+
+    :param url: location to download from
+    :param expected_sha256: expected hex digest of the downloaded content
+    :param dest: file to write the downloaded bytes to
+    :param timeout: seconds passed to ``urlopen`` as its ``timeout``
+    """
+    print(f"Downloading {url}")
+    digest = hashlib.sha256()
+    # Without a timeout a stalled connection would block the CI job indefinitely.
+    with urllib.request.urlopen(url, timeout=timeout) as response, dest.open("wb") as out:
+        while chunk := response.read(64 * 1024):
+            digest.update(chunk)
+            out.write(chunk)
+    got = digest.hexdigest()
+    if got != expected_sha256:
+        # Do not leave an unverified archive behind.
+        dest.unlink(missing_ok=True)
+        fail(f"sha256 mismatch for {url}: expected {expected_sha256}, got {got}")
+
+
+def safe_extract(archive: Path, target: Path) -> None:
+    """Extract *archive* into *target*, refusing entries that would land outside of it.
+
+    Supported formats are chosen by file name: ``.tar``, ``.tar.gz``/``.tgz`` and ``.zip``.
+    Any other extension, and any entry whose resolved path escapes *target*, is reported
+    through ``fail()``, which exits the process with status 1.
+
+    :param archive: archive file to unpack
+    :param target: directory to unpack into; created if it does not exist
+    """
+    target = target.resolve()
+    target.mkdir(parents=True, exist_ok=True)
+    name = archive.name.lower()
+    if name.endswith((".tar.gz", ".tgz", ".tar")):
+        with tarfile.open(archive) as tf:
+            members = tf.getmembers()
+            for member in members:
+                member_path = (target / member.name).resolve()
+                if member_path != target and target not in member_path.parents:
+                    fail(f"archive entry escapes target: {member.name}")
+            if hasattr(tarfile, "data_filter"):
+                # The name check above runs before anything is on disk, so it cannot see a
+                # symlink/hardlink member that redirects a later entry outside the target.
+                # The "data" filter rejects such links, absolute paths and device nodes.
+                try:
+                    tf.extractall(target, filter="data")
+                except tarfile.FilterError as err:
+                    fail(f"archive entry rejected: {err}")
+            else:
+                # No extraction filters in this interpreter: fail closed on anything that
+                # is not a plain file or directory instead of extracting it unchecked.
+                for member in members:
+                    if not (member.isfile() or member.isdir()):
+                        fail(f"archive entry is not a regular file or directory: {member.name}")
+                tf.extractall(target)
+    elif name.endswith(".zip"):
+        with zipfile.ZipFile(archive) as zf:
+            for member_name in zf.namelist():
+                member_path = (target / member_name).resolve()
+                if member_path != target and target not in member_path.parents:
+                    fail(f"archive entry escapes target: {member_name}")
+            zf.extractall(target)
+    else:
+        fail(f"unsupported archive extension: {archive.name}")
+
+
+def manifest_path_for(provider_id: str) -> Path:
+    """Return the path of the ``pre_extras_install.yaml`` manifest for *provider_id*.
+
+    Dots in the id become directory separators below ``PROVIDERS_ROOT``. An id that is
+    not made of lowercase alphanumeric segments joined by ``.``, ``_`` or ``-`` is
+    reported through ``fail()``, which exits the process with status 1.
+
+    :param provider_id: dotted provider id
+    :return: path of the manifest file (not checked for existence here)
+    """
+    # fullmatch, not match with `$`: the latter also accepts an id ending in a newline.
+    if not re.fullmatch(r"[a-z0-9]+(?:[._-][a-z0-9]+)*", provider_id):
+        fail(f"invalid provider id: {provider_id!r}")
+    return PROVIDERS_ROOT / provider_id.replace(".", "/") / "pre_extras_install.yaml"
+
+
+def emit_env_file(env: dict, env_file: Path) -> None:
+    """Write one ``export NAME=<shell-quoted value>`` line per *env* entry to *env_file*.
+
+    An empty *env* results in an empty file.
+    """
+    content = "".join(f"export {key}={shlex.quote(val)}\n" for key, val in env.items())
+    env_file.write_text(content)
+
+
+def main() -> None:
+    """Validate the provider's manifest, run its downloads and write its env exports.
+
+    Each download is fetched into a temporary directory, checksum-verified, extracted
+    to its ``extract_to`` directory and then deleted. The ``export`` lines for the
+    manifest's ``env`` mapping are written to the file given with ``--emit-env-to``.
+    """
+    from urllib.parse import urlsplit
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("provider_id", help="Dotted provider id (e.g. ibm.mq)")
+    parser.add_argument(
+        "--emit-env-to",
+        required=True,
+        type=Path,
+        help="Path to write the export statements for env vars defined by the manifest",
+    )
+    args = parser.parse_args()
+
+    manifest_file = manifest_path_for(args.provider_id)
+    if not manifest_file.is_file():
+        fail(f"manifest not found: {manifest_file}")
+    with manifest_file.open() as fh:
+        manifest = validate_manifest(yaml.safe_load(fh), args.provider_id)
+
+    with tempfile.TemporaryDirectory(
+        prefix=f"pre_extras_install_{args.provider_id.replace('.', '_')}_"
+    ) as tmpdir:
+        tmp = Path(tmpdir)
+        for index, entry in enumerate(manifest.get("downloads", [])):
+            # Name the archive after the URL path only: a query string or fragment would
+            # otherwise end up in the file name and hide the extension from safe_extract().
+            archive_name = Path(urlsplit(entry["url"]).path).name or f"download_{index}"
+            archive = tmp / f"{index}_{archive_name}"
+            download_with_checksum(entry["url"], entry["sha256"], archive)
+            safe_extract(archive, Path(entry["extract_to"]))
+            archive.unlink(missing_ok=True)
+
+    emit_env_file(manifest.get("env", {}), args.emit_env_to)
+
+
+if __name__ == "__main__":
+    main()