This is an automated email from the ASF dual-hosted git repository.

kaxilnaik pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git


The following commit(s) were added to refs/heads/main by this push:
     new fab5402def1 Fix connection docs URLs to use Sphinx inventory instead 
of hardcoded paths (#63349)
fab5402def1 is described below

commit fab5402def16ceba18ac94d8f25704fef725d9f7
Author: Kaxil Naik <[email protected]>
AuthorDate: Wed Mar 11 14:04:32 2026 +0000

    Fix connection docs URLs to use Sphinx inventory instead of hardcoded paths 
(#63349)
    
    The registry was hardcoding `connections/index.html` for all connection
    type docs links, which 404s for providers like Tableau (correct URL is
    `connections/tableau.html`) and Google BigQuery 
(`connections/bigquery.html`).
    
    Now `extract_metadata.py` and `extract_versions.py` parse the provider's
    Sphinx `objects.inv` to resolve per-connection-type URLs from `std:label`
    and `std:doc` entries. Unresolved conn_types fall back to `connections/`.
    
    Also fixes `connection-builder.js` which was mangling the docs URL by
    replacing `index.html` with `{connType}.html` — producing URLs like
    `connections/gcpbigquery.html` (404) instead of using the data as-is.
---
 dev/registry/extract_metadata.py            |  71 +++++++++++--
 dev/registry/extract_versions.py            |  13 ++-
 dev/registry/tests/test_extract_metadata.py | 149 ++++++++++++++++++++++++++++
 registry/src/js/connection-builder.js       |   9 +-
 4 files changed, 224 insertions(+), 18 deletions(-)

diff --git a/dev/registry/extract_metadata.py b/dev/registry/extract_metadata.py
index 1c5047c08e1..25149e39977 100644
--- a/dev/registry/extract_metadata.py
+++ b/dev/registry/extract_metadata.py
@@ -49,9 +49,6 @@ PYPISTATS_RECENT_URL = "https://pypistats.org/api/packages/{package_name}/recent
 PYPI_PACKAGE_JSON_URL = "https://pypi.org/pypi/{package_name}/json"
 S3_DOC_URL = "http://apache-airflow-docs.s3-website.eu-central-1.amazonaws.com"
 AIRFLOW_PROVIDER_DOCS_URL = "https://airflow.apache.org/docs/{package_name}/stable/"
-AIRFLOW_PROVIDER_CONNECTIONS_URL = (
-    "https://airflow.apache.org/docs/{package_name}/stable/connections/index.html"
-)
 AIRFLOW_PROVIDER_SOURCE_URL = (
     "https://github.com/apache/airflow/tree/providers-{provider_id}/{version}/providers/{provider_path}"
 )
@@ -103,15 +100,18 @@ def fetch_pypi_dates(package_name: str) -> dict[str, str]:
         return {"first_released": "", "last_updated": ""}
 
 
-def read_inventory(inv_path: Path) -> dict[str, str]:
-    """Parse a Sphinx objects.inv file and return {qualified_name: url_path} 
for py:class entries."""
+def _parse_inventory_lines(inv_path: Path) -> list[str]:
+    """Read and decompress the body of a Sphinx objects.inv file."""
     with inv_path.open("rb") as f:
-        # Skip the 4 header lines
         for _ in range(4):
             f.readline()
-        data = zlib.decompress(f.read()).decode("utf-8").splitlines()
+        return zlib.decompress(f.read()).decode("utf-8").splitlines()
+
+
+def read_inventory(inv_path: Path) -> dict[str, str]:
+    """Parse a Sphinx objects.inv file and return {qualified_name: url_path} 
for py:class entries."""
     result: dict[str, str] = {}
-    for line in data:
+    for line in _parse_inventory_lines(inv_path):
         parts = line.split(None, 4)
         if len(parts) != 5:
             continue
@@ -122,6 +122,39 @@ def read_inventory(inv_path: Path) -> dict[str, str]:
     return result
 
 
+def read_connection_urls(inv_path: Path) -> dict[str, str]:
+    """Parse a Sphinx objects.inv and return {conn_type: relative_url} for 
connection pages.
+
+    Uses two inventory entry types:
+    - ``std:label howto/connection:{conn_type}`` — maps conn_type directly to 
a page
+    - ``std:doc connections/{name}`` — fallback by matching conn_type to doc 
name
+    """
+    label_map: dict[str, str] = {}  # conn_type -> page URL (from std:label)
+    doc_map: dict[str, str] = {}  # doc_name -> page URL (from std:doc)
+    for line in _parse_inventory_lines(inv_path):
+        parts = line.split(None, 4)
+        if len(parts) != 5:
+            continue
+        name, domain_role, _prio, location, _dispname = parts
+        if domain_role == "std:label" and name.startswith("howto/connection:"):
+            label_key = name[len("howto/connection:") :]
+            # Skip sub-section labels like "gcp:configuring_the_connection"
+            if ":" not in label_key:
+                label_map[label_key] = location.split("#")[0]
+        elif domain_role == "std:doc" and name.startswith("connections/"):
+            doc_name = name[len("connections/") :]
+            if doc_name != "index":
+                doc_map[doc_name] = location
+
+    # Merge: label_map takes precedence, doc_map fills gaps
+    result: dict[str, str] = {}
+    result.update(label_map)
+    for doc_name, url in doc_map.items():
+        if doc_name not in result:
+            result[doc_name] = url
+    return result
+
+
 INVENTORY_CACHE_DIR = Path(__file__).parent / ".inventory_cache"
 INVENTORY_TTL = datetime.timedelta(hours=12)
 
@@ -160,6 +193,18 @@ def fetch_provider_inventory(package_name: str, cache_dir: 
Path = INVENTORY_CACH
         return None
 
 
+def resolve_connection_docs_url(conn_type: str, conn_url_map: dict[str, str], 
base_docs_url: str) -> str:
+    """Resolve the docs URL for a connection type using the inventory map.
+
+    Lookup order:
+    1. Exact match on conn_type in the inventory map
+    2. Fallback to connections/ directory listing
+    """
+    if conn_type in conn_url_map:
+        return f"{base_docs_url}/{conn_url_map[conn_type]}"
+    return f"{base_docs_url}/connections/"
+
+
 # Base paths
 AIRFLOW_ROOT = Path(__file__).parent.parent.parent
 SCRIPT_DIR = Path(__file__).parent
@@ -495,9 +540,13 @@ def main():
                 shutil.copy2(src, registry_logos_dir / logo_filename)
 
         # Extract connection types from provider.yaml
-        # Link to the connections index page since individual connection pages 
might not exist
+        # Resolve per-connection docs URLs from Sphinx inventory when available
         connection_types = []
-        connections_index_url = 
AIRFLOW_PROVIDER_CONNECTIONS_URL.format(package_name=package_name)
+        base_docs_url = 
AIRFLOW_PROVIDER_DOCS_URL.format(package_name=package_name).rstrip("/")
+        conn_url_map: dict[str, str] = {}
+        inv_path = fetch_provider_inventory(package_name)
+        if inv_path:
+            conn_url_map = read_connection_urls(inv_path)
         for conn in provider_yaml.get("connection-types", []):
             conn_type = conn.get("connection-type", "")
             hook_class = conn.get("hook-class-name", "")
@@ -506,7 +555,7 @@ def main():
                     {
                         "conn_type": conn_type,
                         "hook_class": hook_class,
-                        "docs_url": connections_index_url,
+                        "docs_url": resolve_connection_docs_url(conn_type, 
conn_url_map, base_docs_url),
                     }
                 )
 
diff --git a/dev/registry/extract_versions.py b/dev/registry/extract_versions.py
index d52a31b5bf7..38257f19070 100644
--- a/dev/registry/extract_versions.py
+++ b/dev/registry/extract_versions.py
@@ -54,6 +54,7 @@ except ImportError:
     print("ERROR: PyYAML required. Install with: pip install pyyaml")
     sys.exit(1)
 
+from extract_metadata import fetch_provider_inventory, read_connection_urls, 
resolve_connection_docs_url
 from registry_tools.types import MODULE_LEVEL_SECTIONS, TYPE_SUFFIXES
 
 AIRFLOW_ROOT = Path(__file__).parent.parent.parent
@@ -366,13 +367,21 @@ def extract_version_data(
     if layout == "old" and not pyproject_data["dependencies"]:
         pyproject_data["dependencies"] = provider_yaml.get("dependencies", [])
 
-    # Connection types
+    # Connection types — resolve per-conn_type docs URLs from Sphinx inventory
+    package_name = provider_yaml.get("package-name", f"apache-airflow-providers-{provider_id}")
+    base_docs_url = f"https://airflow.apache.org/docs/{package_name}/stable"
+    conn_url_map: dict[str, str] = {}
+    inv_path = fetch_provider_inventory(package_name)
+    if inv_path:
+        conn_url_map = read_connection_urls(inv_path)
     connection_types = []
     for ct in provider_yaml.get("connection-types", []):
+        conn_type = ct.get("connection-type", "")
         connection_types.append(
             {
-                "conn_type": ct.get("connection-type", ""),
+                "conn_type": conn_type,
                 "hook_class": ct.get("hook-class-name", ""),
+                "docs_url": resolve_connection_docs_url(conn_type, 
conn_url_map, base_docs_url),
             }
         )
 
diff --git a/dev/registry/tests/test_extract_metadata.py 
b/dev/registry/tests/test_extract_metadata.py
index 138b41daae4..5bbccd7cb9b 100644
--- a/dev/registry/tests/test_extract_metadata.py
+++ b/dev/registry/tests/test_extract_metadata.py
@@ -34,7 +34,9 @@ from extract_metadata import (
     find_related_providers,
     module_path_to_file_path,
     parse_pyproject_toml,
+    read_connection_urls,
     read_inventory,
+    resolve_connection_docs_url,
 )
 
 
@@ -419,3 +421,150 @@ class TestFetchProviderInventory:
         result = fetch_provider_inventory("apache-airflow-providers-amazon", 
cache_dir=cache_dir)
         assert result is not None
         assert result.read_bytes() == new_content
+
+
+# ---------------------------------------------------------------------------
+# read_connection_urls
+# ---------------------------------------------------------------------------
+class TestReadConnectionUrls:
+    @staticmethod
+    def _make_inventory(tmp_path: Path, entries: list[str]) -> Path:
+        import zlib
+
+        inv_path = tmp_path / "objects.inv"
+        header = (
+            b"# Sphinx inventory version 2\n"
+            b"# Project: test\n"
+            b"# Version: 1.0\n"
+            b"# The remainder of this file is compressed using zlib.\n"
+        )
+        body = "\n".join(entries).encode("utf-8")
+        with inv_path.open("wb") as f:
+            f.write(header)
+            f.write(zlib.compress(body))
+        return inv_path
+
+    def test_parses_std_label_entries(self, tmp_path):
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "howto/connection:kubernetes std:label -1 
connections/kubernetes.html#howto-connection-kubernetes Kubernetes cluster 
Connection",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result == {"kubernetes": "connections/kubernetes.html"}
+
+    def test_parses_std_doc_entries(self, tmp_path):
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "connections/tableau std:doc -1 connections/tableau.html 
Tableau Connection",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result == {"tableau": "connections/tableau.html"}
+
+    def test_label_takes_precedence_over_doc(self, tmp_path):
+        """When both std:label and std:doc exist for the same key, label 
wins."""
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "howto/connection:aws std:label -1 
connections/aws.html#howto-connection-aws AWS Connection",
+                "connections/aws std:doc -1 connections/aws.html AWS 
Connection",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result["aws"] == "connections/aws.html"
+
+    def test_skips_sub_section_labels(self, tmp_path):
+        """Labels like howto/connection:gcp:configuring_the_connection are 
sub-sections, not top-level."""
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "howto/connection:gcp std:label -1 
connections/gcp.html#howto-connection-gcp GCP Connection",
+                "howto/connection:gcp:configuring_the_connection std:label -1 
connections/gcp.html#sub Configuring",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result == {"gcp": "connections/gcp.html"}
+
+    def test_skips_connections_index(self, tmp_path):
+        """The connections/index doc should not appear in the map."""
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "connections/index std:doc -1 connections/index.html 
Connection Types",
+                "connections/kafka std:doc -1 connections/kafka.html Kafka 
Connection",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert "index" not in result
+        assert result == {"kafka": "connections/kafka.html"}
+
+    def test_ignores_unrelated_entries(self, tmp_path):
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "airflow.providers.amazon.hooks.s3.S3Hook py:class 1 
api.html#$ -",
+                "some_module py:module 1 mod.html -",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result == {}
+
+    def test_empty_inventory(self, tmp_path):
+        inv_path = self._make_inventory(tmp_path, [])
+        result = read_connection_urls(inv_path)
+        assert result == {}
+
+    def test_multiple_connection_types(self, tmp_path):
+        """Amazon-style provider with multiple connection pages."""
+        inv_path = self._make_inventory(
+            tmp_path,
+            [
+                "howto/connection:aws std:label -1 
connections/aws.html#howto-connection-aws AWS",
+                "howto/connection:emr std:label -1 
connections/emr.html#howto-connection-emr EMR",
+                "howto/connection:redshift std:label -1 
connections/redshift.html#howto-connection-redshift Redshift",
+                "connections/athena std:doc -1 connections/athena.html Athena",
+            ],
+        )
+        result = read_connection_urls(inv_path)
+        assert result["aws"] == "connections/aws.html"
+        assert result["emr"] == "connections/emr.html"
+        assert result["redshift"] == "connections/redshift.html"
+        assert result["athena"] == "connections/athena.html"
+
+
+# ---------------------------------------------------------------------------
+# resolve_connection_docs_url
+# ---------------------------------------------------------------------------
+class TestResolveConnectionDocsUrl:
+    BASE = "https://airflow.apache.org/docs/apache-airflow-providers-google/stable"
+
+    def test_exact_match(self):
+        conn_map = {"kubernetes": "connections/kubernetes.html"}
+        url = resolve_connection_docs_url("kubernetes", conn_map, self.BASE)
+        assert url == f"{self.BASE}/connections/kubernetes.html"
+
+    def test_fallback_to_connections_dir(self):
+        conn_map = {"kubernetes": "connections/kubernetes.html"}
+        url = resolve_connection_docs_url("unknown_type", conn_map, self.BASE)
+        assert url == f"{self.BASE}/connections/"
+
+    def test_empty_map_falls_back_to_connections_dir(self):
+        url = resolve_connection_docs_url("aws", {}, self.BASE)
+        assert url == f"{self.BASE}/connections/"
+
+    def test_google_bigquery_resolves(self):
+        """gcpbigquery conn_type should resolve to bigquery.html, not index."""
+        conn_map = {
+            "gcp": "connections/gcp.html",
+            "gcpbigquery": "connections/bigquery.html",
+        }
+        url = resolve_connection_docs_url("gcpbigquery", conn_map, self.BASE)
+        assert url == f"{self.BASE}/connections/bigquery.html"
+
+    def test_tableau_resolves(self):
+        conn_map = {"tableau": "connections/tableau.html"}
+        url = resolve_connection_docs_url("tableau", conn_map, self.BASE)
+        assert url == f"{self.BASE}/connections/tableau.html"
diff --git a/registry/src/js/connection-builder.js 
b/registry/src/js/connection-builder.js
index a316a99cb56..3228201f622 100644
--- a/registry/src/js/connection-builder.js
+++ b/registry/src/js/connection-builder.js
@@ -68,12 +68,11 @@
       // Set title
       titleEl.textContent = connType;
 
-      // Show/hide docs link (derive per-connection-type URL)
+      // Show/hide docs link (URL is resolved per-connection-type by extract 
scripts)
       if (docsLink) {
-        var baseDocsUrl = chip.dataset.docsUrl;
-        if (baseDocsUrl) {
-          var perTypeUrl = baseDocsUrl.replace(/index\.html$/, connType + 
".html");
-          docsLink.href = perTypeUrl;
+        var docsUrl = chip.dataset.docsUrl;
+        if (docsUrl) {
+          docsLink.href = docsUrl;
           docsLink.hidden = false;
         } else {
           docsLink.hidden = true;

Reply via email to