This is an automated email from the ASF dual-hosted git repository.
vatsrahul1001 pushed a commit to branch v3-2-test
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/v3-2-test by this push:
new a70a9feb135 Filter unreleased phantom versions from registry build
(#65984) (#66902)
a70a9feb135 is described below
commit a70a9feb135abe51ba67130ba4f36a792a41898c
Author: Rahul Vats <[email protected]>
AuthorDate: Thu May 14 15:16:46 2026 +0530
Filter unreleased phantom versions from registry build (#65984) (#66902)
`extract_metadata.py` took the top entry of `provider.yaml`'s `versions:`
list as a provider's "latest" version with no verification that a real
release tag exists. Provider release prep prepends the next version to
`versions:` BEFORE the tag lands, and versions that so far only have
pre-release (rc) tags are listed in `versions:` without a final tag.
Without filtering, the registry ships phantom "latest" pointers to
non-existent PyPI releases / GitHub tags / docs pages.
Concrete cases this PR catches:
- `providers/celery/provider.yaml` lists `3.19.0` at the top, but only
`providers-celery/3.19.0rc1` and `rc2` tags exist -- no final.
- `providers/akeyless/` is brand-new in-tree with `versions: [1.0.0]`
but no `providers-akeyless/*` tag.
The fix loads all `providers-<id>/<version>` git tags once via
`git tag --list 'providers-*'`, walks each provider's `versions:` list
newest-first, picks the first entry with a matching tag for the singular
`version` (latest) field, and filters the `versions` (list) field to the
same tagged subset. Providers with NO version that has a matching tag are
skipped from the registry entirely (rather than emitted with phantom
pointers).
Also filters the `versions` list -- not just the singular `version` -- so
downstream consumers like `extract_versions.py`'s backfill don't try to
extract from non-existent tags.
`registry-build.yml`'s checkout now sets `fetch-tags: true`. Without it,
the default `fetch-depth: 1` checkout has no tags, the tag lookup returns
an empty set, and the script falls back to the unfiltered behaviour
(taking `versions[0]`) with only a printed warning.
`registry-backfill.yml`'s primary checkout already uses `fetch-depth: 0`
so tags are present there.
Tests: TestLoadReleaseTags (3 cases: parsing, subprocess error, missing
git binary), TestFindLatestReleasedVersion (6 cases including phantom
top, RC-only, cross-provider mismatch, empty list), and
TestVersionsListFiltering (3 cases asserting the list is filtered in
parallel with the latest pointer).
(cherry picked from commit 38d8d419ea67dccaa78a33010afe846aa21a1985)
Co-authored-by: Kaxil Naik <[email protected]>
---
.github/workflows/registry-build.yml | 18 ++-
.../doc/images/output_registry_extract-data.svg | 37 ++++--
.../doc/images/output_registry_extract-data.txt | 2 +-
.../airflow_breeze/commands/registry_commands.py | 19 ++-
.../commands/registry_commands_config.py | 1 +
dev/registry/extract_metadata.py | 109 +++++++++++++++-
dev/registry/tests/test_extract_metadata.py | 144 +++++++++++++++++++++
7 files changed, 310 insertions(+), 20 deletions(-)
diff --git a/.github/workflows/registry-build.yml
b/.github/workflows/registry-build.yml
index 22eb96eca86..ed9f97e901a 100644
--- a/.github/workflows/registry-build.yml
+++ b/.github/workflows/registry-build.yml
@@ -115,6 +115,11 @@ jobs:
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd #
v6.0.2
with:
persist-credentials: false
+ # Tags drive the phantom-version filter in extract_metadata.py
+ # (only versions with a real `providers-<id>/<ver>` tag are
+ # treated as released). Without this, the filter silently
+ # falls back to `versions[0]` and ships phantom versions.
+ fetch-tags: true
- name: "Prepare breeze & CI image"
uses: ./.github/actions/prepare_breeze_and_image
@@ -176,11 +181,20 @@ jobs:
- name: "Extract registry data (breeze)"
env:
PROVIDER: ${{ inputs.provider }}
+ DESTINATION: ${{ inputs.destination }}
run: |
+ # Staging dispatches preview unreleased providers (maintainers want
to
+ # verify newly-bumped versions look right before tagging). Live
builds
+ # filter them so the production registry never ships pointers to
+ # non-existent PyPI releases / GitHub tags / docs pages.
+ ALLOW_UNRELEASED=""
+ if [[ "${DESTINATION}" == "staging" ]]; then
+ ALLOW_UNRELEASED="--allow-unreleased"
+ fi
if [[ -n "${PROVIDER}" ]]; then
- breeze registry extract-data --python 3.12 --provider "${PROVIDER}"
+ breeze registry extract-data --python 3.12 --provider
"${PROVIDER}" ${ALLOW_UNRELEASED}
else
- breeze registry extract-data --python 3.12
+ breeze registry extract-data --python 3.12 ${ALLOW_UNRELEASED}
fi
# --- Incremental: merge new data with existing ---
diff --git a/dev/breeze/doc/images/output_registry_extract-data.svg
b/dev/breeze/doc/images/output_registry_extract-data.svg
index c1c5d148f7b..c7dda404a56 100644
--- a/dev/breeze/doc/images/output_registry_extract-data.svg
+++ b/dev/breeze/doc/images/output_registry_extract-data.svg
@@ -1,4 +1,4 @@
-<svg class="rich-terminal" viewBox="0 0 1482 416.0"
xmlns="http://www.w3.org/2000/svg">
+<svg class="rich-terminal" viewBox="0 0 1482 489.2"
xmlns="http://www.w3.org/2000/svg">
<!-- Generated with Rich https://www.textualize.io -->
<style>
@@ -39,11 +39,12 @@
.breeze-registry-extract-data-r5 { fill: #868887 }
.breeze-registry-extract-data-r6 { fill: #98a84b;font-weight: bold }
.breeze-registry-extract-data-r7 { fill: #8d7b39 }
+.breeze-registry-extract-data-r8 { fill: #d0b344;font-weight: bold }
</style>
<defs>
<clipPath id="breeze-registry-extract-data-clip-terminal">
- <rect x="0" y="0" width="1463.0" height="365.0" />
+ <rect x="0" y="0" width="1463.0" height="438.2" />
</clipPath>
<clipPath id="breeze-registry-extract-data-line-0">
<rect x="0" y="1.5" width="1464" height="24.65"/>
@@ -87,9 +88,18 @@
<clipPath id="breeze-registry-extract-data-line-13">
<rect x="0" y="318.7" width="1464" height="24.65"/>
</clipPath>
+<clipPath id="breeze-registry-extract-data-line-14">
+ <rect x="0" y="343.1" width="1464" height="24.65"/>
+ </clipPath>
+<clipPath id="breeze-registry-extract-data-line-15">
+ <rect x="0" y="367.5" width="1464" height="24.65"/>
+ </clipPath>
+<clipPath id="breeze-registry-extract-data-line-16">
+ <rect x="0" y="391.9" width="1464" height="24.65"/>
+ </clipPath>
</defs>
- <rect fill="#292929" stroke="rgba(255,255,255,0.35)" stroke-width="1"
x="1" y="1" width="1480" height="414" rx="8"/><text
class="breeze-registry-extract-data-title" fill="#c5c8c6" text-anchor="middle"
x="740" y="27">Command: registry extract-data</text>
+ <rect fill="#292929" stroke="rgba(255,255,255,0.35)" stroke-width="1"
x="1" y="1" width="1480" height="487.2" rx="8"/><text
class="breeze-registry-extract-data-title" fill="#c5c8c6" text-anchor="middle"
x="740" y="27">Command: registry extract-data</text>
<g transform="translate(26,22)">
<circle cx="0" cy="0" r="7" fill="#ff5f57"/>
<circle cx="22" cy="0" r="7" fill="#febc2e"/>
@@ -105,15 +115,18 @@
</text><text class="breeze-registry-extract-data-r1" x="12.2" y="93.2"
textLength="939.4"
clip-path="url(#breeze-registry-extract-data-line-3)">Extract provider metadata, parameters, and connection types for the registry.</text><text
class="breeze-registry-extract-data-r1" x="1464" y="93.2" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-3)">
</text><text class="breeze-registry-extract-data-r1" x="1464" y="117.6"
textLength="12.2" clip-path="url(#breeze-registry-extract-data-line-4)">
</text><text class="breeze-registry-extract-data-r5" x="0" y="142"
textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-5)">╭─</text><text
class="breeze-registry-extract-data-r5" x="24.4" y="142" textLength="244"
clip-path="url(#breeze-registry-extract-data-line-5)"> Extract data flags </text><text
class="breeze-registry-extract-data-r5" x="268.4" y="142" textLength="1171.2"
clip-path="url(#breeze-registry-extract-data-line-5)">─────────────────────────
[...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="166.4"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-6)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="166.4" textLength="122"
clip-path="url(#breeze-registry-extract-data-line-6)">--python  </text><text
class="breeze-registry-extract-data-r6" x="170.8" y="166.4" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-6)">-p</text><text
class="breeze-registry-extr [...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="190.8"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-7)">│</text><text
class="breeze-registry-extract-data-r7" x="219.6" y="190.8" textLength="146.4"
clip-path="url(#breeze-registry-extract-data-line-7)">3.13 | 3.14)</text><text
class="breeze-registry-extract-data-r5" x="1451.8" y="190.8" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-7)">│</text><text
class="breeze-registry [...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="215.2"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-8)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="215.2" textLength="122"
clip-path="url(#breeze-registry-extract-data-line-8)">--provider</text><text
class="breeze-registry-extract-data-r1" x="219.6" y="215.2" textLength="829.6"
clip-path="url(#breeze-registry-extract-data-line-8)">Extract only this provider ID&#
[...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="239.6"
textLength="1464"
clip-path="url(#breeze-registry-extract-data-line-9)">╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</text><text
class="breeze-registry-extract-data-r1" x="1464" y="239.6" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-9)">
-</text><text class="breeze-registry-extract-data-r5" x="0" y="264"
textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-10)">╭─</text><text
class="breeze-registry-extract-data-r5" x="24.4" y="264" textLength="195.2"
clip-path="url(#breeze-registry-extract-data-line-10)"> Common options </text><text
class="breeze-registry-extract-data-r5" x="219.6" y="264" textLength="1220"
clip-path="url(#breeze-registry-extract-data-line-10)">───────────────────────────────
[...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="288.4"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-11)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="288.4" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-11)">--verbose</text><text
class="breeze-registry-extract-data-r6" x="158.6" y="288.4" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-11)">-v</text><text
class="breeze-registry-extract-da [...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="312.8"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-12)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="312.8" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-12)">--dry-run</text><text
class="breeze-registry-extract-data-r6" x="158.6" y="312.8" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-12)">-D</text><text
class="breeze-registry-extract-da [...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="337.2"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-13)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="337.2" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-13)">--help   </text><text
class="breeze-registry-extract-data-r6" x="158.6" y="337.2" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-13)">-h</text><text
class="breeze-regi [...]
-</text><text class="breeze-registry-extract-data-r5" x="0" y="361.6"
textLength="1464"
clip-path="url(#breeze-registry-extract-data-line-14)">╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</text><text
class="breeze-registry-extract-data-r1" x="1464" y="361.6" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-14)">
+</text><text class="breeze-registry-extract-data-r5" x="0" y="166.4"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-6)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="166.4" textLength="219.6"
clip-path="url(#breeze-registry-extract-data-line-6)">--python          </text><text
class="breeze-registry-extract-data-r6" x="268.4" y="166.4" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-l [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="190.8"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-7)">│</text><text
class="breeze-registry-extract-data-r7" x="317.2" y="190.8" textLength="256.2"
clip-path="url(#breeze-registry-extract-data-line-7)">| 3.12 | 3.13 | 3.14)</text><text
class="breeze-registry-extract-data-r5" x="1451.8" y="190.8" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-7)">│</text><tex [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="215.2"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-8)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="215.2" textLength="219.6"
clip-path="url(#breeze-registry-extract-data-line-8)">--provider        </text><text
class="breeze-registry-extract-data-r1" x="317.2" y="215.2" textLength="829.6"
clip-path="url(#breeze-registry-extract-data-line-8)">E [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="239.6"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-9)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="239.6" textLength="219.6"
clip-path="url(#breeze-registry-extract-data-line-9)">--allow-unreleased</text><text
class="breeze-registry-extract-data-r1" x="317.2" y="239.6" textLength="829.6"
clip-path="url(#breeze-registry-extract-data-line-9)">Include providers and vers
[...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="264"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-10)">│</text><text
class="breeze-registry-extract-data-r1" x="317.2" y="264" textLength="1122.4"
clip-path="url(#breeze-registry-extract-data-line-10)">for staging builds and local dev where maintainers want to preview unreleased provider pages</text><text
class="breeze-registry-extract-data [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="288.4"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-11)">│</text><text
class="breeze-registry-extract-data-r1" x="317.2" y="288.4" textLength="1122.4"
clip-path="url(#breeze-registry-extract-data-line-11)">before the tag lands. Forwarded to extract_metadata.py.                   &#
[...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="312.8"
textLength="1464"
clip-path="url(#breeze-registry-extract-data-line-12)">╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</text><text
class="breeze-registry-extract-data-r1" x="1464" y="312.8" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-12)">
+</text><text class="breeze-registry-extract-data-r5" x="0" y="337.2"
textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-13)">╭─</text><text
class="breeze-registry-extract-data-r5" x="24.4" y="337.2" textLength="195.2"
clip-path="url(#breeze-registry-extract-data-line-13)"> Common options </text><text
class="breeze-registry-extract-data-r5" x="219.6" y="337.2" textLength="1220"
clip-path="url(#breeze-registry-extract-data-line-13)">─────────────────────────
[...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="361.6"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-14)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="361.6" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-14)">--verbose</text><text
class="breeze-registry-extract-data-r6" x="158.6" y="361.6" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-14)">-v</text><text
class="breeze-registry-extract-da [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="386"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-15)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="386" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-15)">--dry-run</text><text
class="breeze-registry-extract-data-r6" x="158.6" y="386" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-15)">-D</text><text
class="breeze-registry-extract-data-r1" [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="410.4"
textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-16)">│</text><text
class="breeze-registry-extract-data-r4" x="24.4" y="410.4" textLength="109.8"
clip-path="url(#breeze-registry-extract-data-line-16)">--help   </text><text
class="breeze-registry-extract-data-r6" x="158.6" y="410.4" textLength="24.4"
clip-path="url(#breeze-registry-extract-data-line-16)">-h</text><text
class="breeze-regi [...]
+</text><text class="breeze-registry-extract-data-r5" x="0" y="434.8"
textLength="1464"
clip-path="url(#breeze-registry-extract-data-line-17)">╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯</text><text
class="breeze-registry-extract-data-r1" x="1464" y="434.8" textLength="12.2"
clip-path="url(#breeze-registry-extract-data-line-17)">
</text>
</g>
</g>
diff --git a/dev/breeze/doc/images/output_registry_extract-data.txt
b/dev/breeze/doc/images/output_registry_extract-data.txt
index a5b0e70c052..be322d50355 100644
--- a/dev/breeze/doc/images/output_registry_extract-data.txt
+++ b/dev/breeze/doc/images/output_registry_extract-data.txt
@@ -1 +1 @@
-602ea508f9bcf0d5c2f97a220f5ee6d2
+5ece18e98af19619094e0ee3c439b73b
diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands.py
b/dev/breeze/src/airflow_breeze/commands/registry_commands.py
index 0655deb51d1..f831f727404 100644
--- a/dev/breeze/src/airflow_breeze/commands/registry_commands.py
+++ b/dev/breeze/src/airflow_breeze/commands/registry_commands.py
@@ -67,9 +67,20 @@ def registry_group():
default=None,
help="Extract only this provider ID (e.g. 'amazon'). Omit for full build.",
)
[email protected](
+ "--allow-unreleased",
+ is_flag=True,
+ default=False,
+ help=(
+ "Include providers and versions that don't have a matching "
+ "providers-<id>/<ver> git tag. Use for staging builds and local dev "
+ "where maintainers want to preview unreleased provider pages before "
+ "the tag lands. Forwarded to extract_metadata.py."
+ ),
+)
@option_verbose
@option_dry_run
-def extract_data(python: str, provider: str | None):
+def extract_data(python: str, provider: str | None, allow_unreleased: bool):
unique_project_name = f"breeze-registry-{uuid.uuid4().hex[:8]}"
shell_params = ShellParams(
@@ -88,9 +99,13 @@ def extract_data(python: str, provider: str | None):
install_cmd = f"pip install --quiet {' '.join(suspended_packages)} && " if
suspended_packages else ""
provider_flag = f" --provider '{provider}'" if provider else ""
+ # --allow-unreleased only applies to extract_metadata.py (which owns the
+ # version filter). The other two scripts read from providers.json and
+ # don't need it.
+ metadata_extra = " --allow-unreleased" if allow_unreleased else ""
command = (
f"{install_cmd}"
- f"python dev/registry/extract_metadata.py{provider_flag} && "
+ f"python
dev/registry/extract_metadata.py{provider_flag}{metadata_extra} && "
f"python dev/registry/extract_parameters.py{provider_flag} && "
f"python dev/registry/extract_connections.py{provider_flag}"
)
diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py
b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py
index fdd156d45a3..24b4e870ab0 100644
--- a/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py
+++ b/dev/breeze/src/airflow_breeze/commands/registry_commands_config.py
@@ -32,6 +32,7 @@ REGISTRY_PARAMETERS: dict[str, list[dict[str, str |
list[str]]]] = {
"options": [
"--python",
"--provider",
+ "--allow-unreleased",
],
},
],
diff --git a/dev/registry/extract_metadata.py b/dev/registry/extract_metadata.py
index 3dd208e3cfd..19b32dcbf18 100644
--- a/dev/registry/extract_metadata.py
+++ b/dev/registry/extract_metadata.py
@@ -35,6 +35,7 @@ import datetime
import json
import re
import shutil
+import subprocess
import urllib.request
import zlib
from dataclasses import asdict, dataclass, field
@@ -363,6 +364,48 @@ def find_related_providers(provider_id: str,
all_provider_yamls: dict[str, dict]
return related[:5] # Limit to 5 related providers
+def load_release_tags() -> set[str]:
+ """Return all ``providers-<id>/<version>`` git tags as a set for fast
lookup.
+
+ Used to filter ``provider.yaml`` ``versions:`` lists to only entries that
+ correspond to a real release (excludes phantom version bumps where the
+ next-version entry was prepended to ``versions:`` before the tag landed,
+ or pre-release-only versions like ``providers-celery/3.19.0rc1`` where the
+ ``rc1`` exists but the final does not).
+
+ Returns an empty set if the ``git`` command fails (e.g., outside a
checkout);
+ callers can decide whether to fall back to the unfiltered top entry.
+ """
+ try:
+ result = subprocess.run(
+ ["git", "tag", "--list", "providers-*"],
+ capture_output=True,
+ text=True,
+ cwd=AIRFLOW_ROOT,
+ check=True,
+ )
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ return set()
+ return {line.strip() for line in result.stdout.splitlines() if
line.strip()}
+
+
+def find_latest_released_version(
+ provider_id: str,
+ versions_list: list[str],
+ release_tags: set[str],
+) -> str | None:
+ """Walk ``versions_list`` newest-first, return the first version with a
real release tag.
+
+ Returns ``None`` when no entry in ``versions_list`` has a corresponding
+ ``providers-<id>/<version>`` tag, indicating the provider is unreleased
+ (brand-new in-tree, no tags yet) or in an inconsistent state.
+ """
+ for version in versions_list:
+ if f"providers-{provider_id}/{version}" in release_tags:
+ return version
+ return None
+
+
def main():
"""Main extraction function."""
import argparse
@@ -373,6 +416,17 @@ def main():
default=None,
help="Extract only this provider ID (e.g. 'amazon'). Omit for full
build.",
)
+ parser.add_argument(
+ "--allow-unreleased",
+ action="store_true",
+ help=(
+ "Include providers and versions that don't have a matching "
+ "providers-<id>/<ver> git tag. Use for staging builds and local
dev "
+ "where maintainers want to preview unreleased provider pages
before "
+ "the tag lands. Default is to filter unreleased entries so live "
+ "builds don't ship phantom pointers."
+ ),
+ )
args = parser.parse_args()
print("Airflow Registry Metadata Extractor")
@@ -382,6 +436,8 @@ def main():
print(f"Incremental mode: extracting provider(s)
{requested_providers}")
else:
requested_providers = None
+ if args.allow_unreleased:
+ print("Unreleased providers: INCLUDED (--allow-unreleased)")
# Ensure output directory exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
@@ -419,6 +475,27 @@ def main():
else:
extraction_ids = set(all_provider_yamls.keys())
+ # Load all release tags once. Used below to filter `provider.yaml`'s
+ # `versions:` to only entries that have a real `providers-<id>/<ver>`
+ # git tag, avoiding phantom-version leaks (next-release bumps prepended
+ # to `versions:` before the tag lands, RC-only releases, brand-new
+ # providers with no tags yet).
+ #
+ # Skipped entirely when --allow-unreleased is set: staging builds and
+ # local dev want to preview unreleased provider pages so maintainers
+ # can verify them before tagging.
+ if args.allow_unreleased:
+ release_tags: set[str] = set()
+ else:
+ release_tags = load_release_tags()
+ if not release_tags:
+ print(
+ " Warning: no providers-* git tags found; "
+ "phantom version filter is disabled (falling back to
versions[0]). "
+ "If this is a CI run, ensure the checkout step uses
fetch-tags: true."
+ )
+ skipped_unreleased: list[str] = []
+
# Second pass: Extract full metadata (only for providers in extraction_ids)
for provider_id in extraction_ids:
provider_yaml = all_provider_yamls[provider_id]
@@ -451,9 +528,29 @@ def main():
if len(description) > 200:
description = description[:197] + "..."
- # Get versions
- versions = provider_yaml.get("versions", [])
- version = versions[0] if versions else "0.0.0"
+ # Get versions, filtering to entries that have a real release tag.
+ # Provider release prep prepends the next version to `versions:` BEFORE
+ # the tag lands, and pre-release-only versions match `versions:` but
+ # have no final tag. Without filtering, `version` (the latest pointer)
+ # AND the `versions` list both leak phantoms downstream -- the latter
+ # is consumed by extract_versions.py's backfill, which would try to
+ # `git show` from a non-existent tag.
+ raw_versions = provider_yaml.get("versions", [])
+ if release_tags:
+ versions = [v for v in raw_versions if
f"providers-{provider_id}/{v}" in release_tags]
+ version = find_latest_released_version(provider_id, raw_versions,
release_tags)
+ if version is None:
+ skipped_unreleased.append(provider_id)
+ print(
+ f" Skipping {provider_id}: no released version found in "
+ f"versions list {raw_versions} "
+ f"(no matching providers-{provider_id}/<ver> tag)"
+ )
+ continue
+ else:
+ # No tag information available -- fall back to old behaviour.
+ versions = list(raw_versions)
+ version = versions[0] if versions else "0.0.0"
# Extract categories from integrations
categories = extract_integrations_as_categories(provider_yaml)
@@ -607,6 +704,12 @@ def main():
all_providers.append(provider)
print(f" {provider_id}: {len(categories)} categories")
+ if skipped_unreleased:
+ print(
+ f"\nSkipped {len(skipped_unreleased)} unreleased provider(s) "
+ f"(no matching git tag): {sorted(skipped_unreleased)}"
+ )
+
# Find related providers
for provider in all_providers:
provider.related_providers = find_related_providers(provider.id,
all_provider_yamls)
diff --git a/dev/registry/tests/test_extract_metadata.py
b/dev/registry/tests/test_extract_metadata.py
index 5bbccd7cb9b..df7b2de71b5 100644
--- a/dev/registry/tests/test_extract_metadata.py
+++ b/dev/registry/tests/test_extract_metadata.py
@@ -31,7 +31,9 @@ from extract_metadata import (
fetch_provider_inventory,
fetch_pypi_dates,
fetch_pypi_downloads,
+ find_latest_released_version,
find_related_providers,
+ load_release_tags,
module_path_to_file_path,
parse_pyproject_toml,
read_connection_urls,
@@ -568,3 +570,145 @@ class TestResolveConnectionDocsUrl:
conn_map = {"tableau": "connections/tableau.html"}
url = resolve_connection_docs_url("tableau", conn_map, self.BASE)
assert url == f"{self.BASE}/connections/tableau.html"
+
+
+# ---------------------------------------------------------------------------
+# load_release_tags
+# ---------------------------------------------------------------------------
+class TestLoadReleaseTags:
+ def test_parses_subprocess_output(self):
+ from unittest.mock import MagicMock, patch
+
+ mock_result = MagicMock()
+ mock_result.stdout = (
+ "providers-amazon/9.25.0\n"
+ "providers-amazon/9.26.0\n"
+ "providers-celery/3.18.0\n"
+ "providers-celery/3.19.0rc1\n"
+ "\n" # blank line
+ " providers-google/21.2.0 \n" # whitespace tolerated
+ )
+ with patch("extract_metadata.subprocess.run",
return_value=mock_result) as mock_run:
+ tags = load_release_tags()
+
+ assert tags == {
+ "providers-amazon/9.25.0",
+ "providers-amazon/9.26.0",
+ "providers-celery/3.18.0",
+ "providers-celery/3.19.0rc1",
+ "providers-google/21.2.0",
+ }
+ # The git command runs against the providers-* glob
+ cmd = mock_run.call_args[0][0]
+ assert cmd[:3] == ["git", "tag", "--list"]
+ assert cmd[3] == "providers-*"
+
+ def test_returns_empty_set_on_subprocess_failure(self):
+ from subprocess import CalledProcessError
+ from unittest.mock import patch
+
+ with patch(
+ "extract_metadata.subprocess.run",
+ side_effect=CalledProcessError(1, ["git", "tag", "--list"]),
+ ):
+ tags = load_release_tags()
+ assert tags == set()
+
+ def test_returns_empty_set_when_git_not_installed(self):
+ from unittest.mock import patch
+
+ with patch("extract_metadata.subprocess.run",
side_effect=FileNotFoundError):
+ tags = load_release_tags()
+ assert tags == set()
+
+
+# ---------------------------------------------------------------------------
+# find_latest_released_version
+# ---------------------------------------------------------------------------
+class TestFindLatestReleasedVersion:
+ def test_returns_top_when_top_has_tag(self):
+ tags = {"providers-amazon/9.26.0", "providers-amazon/9.25.0"}
+ assert find_latest_released_version("amazon", ["9.26.0", "9.25.0"],
tags) == "9.26.0"
+
+ def test_walks_past_phantom_top(self):
+ # celery 3.19.0 is in versions: but no final tag -- only rc1.
+ tags = {
+ "providers-celery/3.19.0rc1",
+ "providers-celery/3.18.0",
+ "providers-celery/3.17.2",
+ }
+ result = find_latest_released_version("celery", ["3.19.0", "3.18.0",
"3.17.2"], tags)
+ assert result == "3.18.0"
+
+ def test_returns_none_when_no_versions_have_tags(self):
+ # akeyless: brand-new provider, listed in versions: but never tagged.
+ tags = {"providers-amazon/9.26.0"} # different provider
+ result = find_latest_released_version("akeyless", ["1.0.0"], tags)
+ assert result is None
+
+ def test_returns_none_for_empty_versions_list(self):
+ result = find_latest_released_version("amazon", [],
{"providers-amazon/9.26.0"})
+ assert result is None
+
+ def test_rc_only_treated_as_phantom(self):
+ # Final 3.19.0 is missing; rc1/rc2 exist. Final must match exactly.
+ tags = {"providers-celery/3.19.0rc1", "providers-celery/3.19.0rc2",
"providers-celery/3.18.0"}
+ # versions: list contains only the would-be final
+ result = find_latest_released_version("celery", ["3.19.0"], tags)
+ assert result is None
+ # When fallback also exists in versions, returns the fallback
+ result = find_latest_released_version("celery", ["3.19.0", "3.18.0"],
tags)
+ assert result == "3.18.0"
+
+ def test_does_not_match_other_providers_tags(self):
+ # provider id is part of the tag prefix; pure version coincidence
shouldn't match
+ tags = {"providers-google/9.26.0"}
+ result = find_latest_released_version("amazon", ["9.26.0"], tags)
+ assert result is None
+
+
+# ---------------------------------------------------------------------------
+# Filter behaviour applied to the `versions` list (not just the latest pointer)
+# ---------------------------------------------------------------------------
+class TestVersionsListFiltering:
+ """Regression test for the bug where Provider.version (singular) was
+ filtered to a real release but Provider.versions (list) still contained
+ phantom entries. Downstream consumers like extract_versions.py read the
+ list and would chase non-existent backfill tags.
+ """
+
+ def test_filter_drops_phantom_top_from_list(self):
+ # This mirrors the in-loop logic. We don't have to test main()
+ # end-to-end -- the filter is a single comprehension that we can
+ # exercise directly to lock in the contract.
+ provider_id = "celery"
+ raw_versions = ["3.19.0", "3.18.0", "3.17.2"]
+ release_tags = {
+ "providers-celery/3.19.0rc1", # not the final
+ "providers-celery/3.18.0",
+ "providers-celery/3.17.2",
+ }
+ filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}"
in release_tags]
+ assert filtered == ["3.18.0", "3.17.2"]
+ # And the latest pointer agrees
+ assert find_latest_released_version(provider_id, raw_versions,
release_tags) == "3.18.0"
+
+ def test_filter_drops_unreleased_provider(self):
+ provider_id = "akeyless"
+ raw_versions = ["1.0.0"]
+ release_tags = {"providers-amazon/9.26.0"} # different provider
+ filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}"
in release_tags]
+ assert filtered == []
+ assert find_latest_released_version(provider_id, raw_versions,
release_tags) is None
+
+ def test_filter_preserves_order(self):
+ provider_id = "amazon"
+ raw_versions = ["9.27.0", "9.26.0", "9.25.0", "9.24.0"] # 9.27.0
phantom
+ release_tags = {
+ "providers-amazon/9.26.0",
+ "providers-amazon/9.25.0",
+ "providers-amazon/9.24.0",
+ }
+ filtered = [v for v in raw_versions if f"providers-{provider_id}/{v}"
in release_tags]
+ # Order from raw_versions is preserved; only the phantom is dropped
+ assert filtered == ["9.26.0", "9.25.0", "9.24.0"]