(airflow) branch main updated: Add script that verifies consistency: Release Page with Release Calendar (#59142)

potiuk Sun, 07 Dec 2025 07:39:01 -0800

This is an automated email from the ASF dual-hosted git repository.

potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git



The following commit(s) were added to refs/heads/main by this push:
     new f0d0b2fb993 Add script that verifies consistency: Release Page with 
Release Calendar (#59142)
f0d0b2fb993 is described below

commit f0d0b2fb993aadb8b56266f37324e275703a88a6
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sun Dec 7 16:38:47 2025 +0100

    Add script that verifies consistency: Release Page with Release Calendar 
(#59142)
    
    We maintain release calendar with release page in Confluence in parallel.
    
    This script (AI generated) verifies if what is in Release page is correctly
    reflected in the calendar.
---
 .github/workflows/ci-amd-arm.yml |  17 ++
 dev/pyproject.toml               |   3 +
 dev/verify_release_calendar.py   | 600 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 620 insertions(+)

diff --git a/.github/workflows/ci-amd-arm.yml b/.github/workflows/ci-amd-arm.yml
index 3987e8ab258..1ab45e39154 100644
--- a/.github/workflows/ci-amd-arm.yml
+++ b/.github/workflows/ci-amd-arm.yml
@@ -231,6 +231,23 @@ jobs:
       platform: ${{ needs.build-info.outputs.platform }}
       shared-distributions-as-json: ${{needs.build-info.outputs.shared-distributions-as-json}}
 
+  verify-release-calendar:
+    name: "Verify release calendar"
+    runs-on: ${{ fromJSON(needs.build-info.outputs.runner-type) }}
+    needs: [build-info]
+    # Only run on canary builds (push to main, scheduled runs, or manual dispatch)
+    if: needs.build-info.outputs.canary-run == 'true'
+    timeout-minutes: 10
+    steps:
+      - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
+        with:
+          persist-credentials: false
+      - name: "Install uv"
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+      - name: "Verify release calendar"
+        run: uv run dev/verify_release_calendar.py
+
   build-ci-images:
     name: Build CI images
     needs: [build-info]
diff --git a/dev/pyproject.toml b/dev/pyproject.toml
index e3a9722c33c..9f69f4b2144 100644
--- a/dev/pyproject.toml
+++ b/dev/pyproject.toml
@@ -36,7 +36,9 @@ maintainers = [
 version = "0.0.1"
 
 dependencies = [
+    "beautifulsoup4>=4.12.0",
     "click>=8.1.8",
+    "icalendar>=5.0.0",
     "jinja2>=3.1.5",
     "keyring==25.6.0",
     "PyGithub>=2.1.1",
@@ -45,6 +47,7 @@ dependencies = [
     'pendulum>=3.1.0',
     "pyyaml>=6.0.3",
     "packaging>=25.0",
+    "requests>=2.31.0",
     "rich>=13.6.0",
     "rich-click>=1.7.1",
     "semver>=3.0.2",
diff --git a/dev/verify_release_calendar.py b/dev/verify_release_calendar.py
new file mode 100755
index 00000000000..29dd05494ad
--- /dev/null
+++ b/dev/verify_release_calendar.py
@@ -0,0 +1,600 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# /// script
+# requires-python = ">=3.8"
+# dependencies = [
+#     "requests",
+#     "beautifulsoup4",
+#     "icalendar",
+#     "rich",
+# ]
+# ///
+"""
+Verify that releases planned in Confluence wiki have matching Google Calendar entries.
+
+This script fetches the release plan from the Confluence wiki page and compares it
+with the Google Calendar entries to ensure they match.
+
+Release Plan: https://cwiki.apache.org/confluence/display/AIRFLOW/Release+Plan
+Calendar iCal: https://calendar.google.com/calendar/ical/c_de214e92df3b759779cb65f3e49e562796c6126e7500cfa7e524bf78186d8b5e%40group.calendar.google.com/public/basic.ics
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+from icalendar import Calendar
+from rich.console import Console
+
+# Shared rich console used for every message the script prints.
+console = Console()
+
+# Wiki page that holds the release plan tables.
+CONFLUENCE_URL = "https://cwiki.apache.org/confluence/display/AIRFLOW/Release+Plan"
+# iCal feed of the release calendar; the literal is split only to keep lines short.
+CALENDAR_ICAL_URL = (
+    "https://calendar.google.com/calendar/ical/"
+    "c_de214e92df3b759779cb65f3e49e562796c6126e7500cfa7e524bf78186d8b5e%40group.calendar.google.com/"
+    "public/basic.ics"
+)
+
+
+@dataclass
+class Release:
+    """One planned release.
+
+    ``str()`` renders it as ``<type> <version> on <YYYY-MM-DD> by <manager>``.
+    """
+
+    # Release category label, e.g. "Airflow Ctl" or "Providers".
+    release_type: str
+    # Version label of the release.
+    version: str
+    # Planned date; only the calendar day is used when rendering.
+    date: datetime
+    # Name of the release manager.
+    release_manager: str
+
+    def __str__(self):
+        planned_day = format(self.date, "%Y-%m-%d")
+        return f"{self.release_type} {self.version} on {planned_day} by {self.release_manager}"
+
+
+@dataclass
+class CalendarEntry:
+    """One calendar event.
+
+    ``str()`` renders it as ``<summary> on <YYYY-MM-DD>``.
+    """
+
+    # Event title.
+    summary: str
+    # Start of the event; only the calendar day is used when rendering.
+    start_date: datetime
+    # Optional free-text description of the event.
+    description: str | None = None
+
+    def __str__(self):
+        start_day = format(self.start_date, "%Y-%m-%d")
+        return f"{self.summary} on {start_day}"
+
+
+def fetch_confluence_page() -> str:
+    """Download the release plan page and return its HTML text.
+
+    Any ``requests.RequestException`` (including an HTTP error status raised by
+    ``raise_for_status``) is printed and ends the script with exit code 1.
+    """
+    console.print(f"[cyan]Fetching Confluence page:[/cyan] {CONFLUENCE_URL}")
+    try:
+        page = requests.get(CONFLUENCE_URL, timeout=30)
+        page.raise_for_status()
+        html = page.text
+    except requests.RequestException as error:
+        console.print(f"[red]Failed to fetch Confluence page:[/red] {error}")
+        sys.exit(1)
+    return html
+
+
+def print_confluence_debug_info(soup: BeautifulSoup) -> None:
+    """Print how many headings and tables the page has, plus the first ten headings."""
+    headings = soup.find_all(["h1", "h2", "h3", "h4", "h5"])
+    console.print(f"[dim]Found {len(headings)} headings in page[/dim]")
+    if headings:
+        console.print("[dim]First 10 headings:[/dim]")
+        for tag in headings[:10]:
+            # Titles are cut to 80 characters to keep the debug output compact.
+            preview = tag.get_text(strip=True)[:80]
+            console.print(f"  [dim]{tag.name}: {preview}[/dim]")
+
+    table_count = len(soup.find_all("table"))
+    console.print(f"[dim]Found {table_count} tables in page[/dim]")
+
+
+def get_release_sections() -> dict[str, list[str]]:
+    """Return, per release type, the heading texts that may introduce its table.
+
+    Keys are the release type labels; each value lists candidate section names
+    in the order they should be tried.
+    """
+    airflow_ctl_titles = ["Airflow Ctl", "airflow-ctl", "airflow ctl"]
+    provider_titles = [
+        "Support for Airflow in Providers",
+        "Provider Releases",
+        "Providers",
+        "Provider",
+        "provider release",
+    ]
+    return {"Airflow Ctl": airflow_ctl_titles, "Providers": provider_titles}
+
+
+def find_table_for_heading(heading: Any) -> Any | None:
+    """Return the table that belongs to ``heading``, or ``None`` if it has none.
+
+    The search never crosses into the next section: a table that only appears
+    after another h1-h5 heading is not returned, so a section without a table
+    cannot pick up the table of a later section.
+    """
+    heading_tags = ["h1", "h2", "h3", "h4", "h5"]
+
+    # First look for a table that is a direct sibling of the heading.
+    sibling = heading.find_next_sibling()
+    while sibling:
+        if sibling.name == "table":
+            console.print("  [dim]Found table directly after heading[/dim]")
+            return sibling
+        if sibling.name in heading_tags:
+            # Stop if we hit another heading
+            break
+        sibling = sibling.find_next_sibling()
+
+    # Otherwise the table may be nested inside a wrapper element. Take the next
+    # heading-or-table in document order and accept it only when it is a table,
+    # i.e. when no other heading comes first.
+    following = heading.find_next(heading_tags + ["table"])
+    if following is not None and following.name == "table":
+        console.print("  [dim]Found table via find_next[/dim]")
+        return following
+
+    return None
+
+
+def find_section_and_parse(soup: BeautifulSoup, release_type: str, section_names: list[str]) -> list[Release]:
+    """Locate the section for ``release_type`` and parse the releases in its table.
+
+    Section names are tried in order. For each name only the first heading whose
+    text contains it (case-insensitively) is considered; if that heading has no
+    table, the next name is tried. Returns an empty list when nothing is found.
+    """
+    titled_headings = [(tag, tag.get_text(strip=True)) for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5"])]
+    for wanted in section_names:
+        needle = wanted.lower()
+        hit = next(((tag, title) for tag, title in titled_headings if needle in title.lower()), None)
+        if hit is None:
+            continue
+        tag, title = hit
+        console.print(f"[green]Found section:[/green] {title}")
+        table = find_table_for_heading(tag)
+        if table is not None:
+            return parse_table(table, release_type)
+    return []
+
+
+def parse_confluence_releases(html_content: str) -> list[Release]:
+    """Parse the page HTML and return the releases of every known release type.
+
+    A warning is printed for each release type whose section yields no releases.
+    """
+    console.print("[cyan]Parsing Confluence releases...[/cyan]")
+    soup = BeautifulSoup(html_content, "html.parser")
+    print_confluence_debug_info(soup)
+
+    collected: list[Release] = []
+    for release_type, section_names in get_release_sections().items():
+        found = find_section_and_parse(soup, release_type, section_names)
+        if not found:
+            console.print(f"[yellow]Could not find section for {release_type}[/yellow]")
+            continue
+        collected.extend(found)
+
+    console.print(f"[green]Found {len(collected)} releases in Confluence[/green]")
+    return collected
+
+
+def get_table_headers(rows: list[Any]) -> tuple[list[str], bool]:
+    """Return the lower-cased header texts of the first row and a validity flag.
+
+    The table is reported invalid (``([], False)``) when it has fewer than two
+    rows, i.e. no data row below the header.
+    """
+    if len(rows) < 2:
+        console.print("  [yellow]Table has no data rows[/yellow]")
+        return [], False
+
+    first_row = rows[0]
+    headers = [cell.get_text(strip=True).lower() for cell in first_row.find_all(["td", "th"])]
+    console.print(f"  [dim]Headers: {headers}[/dim]")
+    return headers, True
+
+
+def find_column_indices(headers: list[str]) -> tuple[int | None, int | None, int | None]:
+    """Return the column positions of version, date and manager (``None`` if absent).
+
+    Each header is assigned to at most one role, checked in this order: a header
+    containing "version" but not "suffix", then one containing "date", then one
+    containing "manager". When several headers fit a role, the last one wins.
+    """
+    found: dict[str, int | None] = {"version": None, "date": None, "manager": None}
+
+    for position, title in enumerate(headers):
+        if "version" in title and "suffix" not in title:
+            found["version"] = position
+        elif "date" in title:
+            found["date"] = position
+        elif "manager" in title:
+            found["manager"] = position
+
+    version_idx, date_idx, manager_idx = found["version"], found["date"], found["manager"]
+    console.print(
+        f"  [dim]Column mapping - version: {version_idx}, date: {date_idx}, manager: {manager_idx}[/dim]"
+    )
+    return version_idx, date_idx, manager_idx
+
+
+def parse_date_string(date_str: str) -> datetime | None:
+    """Parse ``date_str`` with the first matching known format, or return ``None``.
+
+    A "week of" prefix (any letter case) is removed before parsing. When no
+    format matches, a warning is printed and ``None`` is returned.
+    """
+    known_formats = (
+        "%d %b %Y",  # 09 Dec 2025
+        "%d %B %Y",  # 09 December 2025
+        "%Y-%m-%d",  # 2025-12-06
+        "%Y/%m/%d",  # 2025/12/06
+        "%m/%d/%Y",  # 12/06/2025
+        "%d-%m-%Y",  # 06-12-2025
+        "%b %d, %Y",  # Dec 09, 2025
+        "%B %d, %Y",  # December 09, 2025
+    )
+
+    # "Week of DD Mon YYYY" is reduced to the bare date (lower-cased as a side effect).
+    lowered = date_str.lower()
+    candidate = lowered.replace("week of", "").strip() if "week of" in lowered else date_str
+
+    for fmt in known_formats:
+        try:
+            return datetime.strptime(candidate, fmt)
+        except ValueError:
+            pass
+
+    console.print(
+        f"  [yellow]Could not parse date:[/yellow] '{date_str}' (tried {len(known_formats)} formats)"
+    )
+    return None
+
+
+def extract_manager_first_name(release_manager: str) -> str:
+    """Return the first word of the first manager listed in ``release_manager``.
+
+    Several managers may be joined with "+"; only the part before the first "+"
+    is used. An empty string yields an empty string.
+    """
+    if not release_manager:
+        return ""
+    # partition() returns the whole string when there is no "+".
+    leading_manager = release_manager.partition("+")[0]
+    return leading_manager.split()[0]
+
+
+def generate_version_from_date(date: datetime) -> str:
+    """Return ``date`` as ``YYYY.MM.DD``, used as the version when none is given."""
+    return f"{date:%Y.%m.%d}"
+
+
+def parse_table_row(
+    cells: list[Any],
+    row_num: int,
+    version_idx: int | None,
+    date_idx: int | None,
+    manager_idx: int | None,
+    release_type: str,
+) -> Release | None:
+    """Turn one table row into a ``Release``, or return ``None`` if it is unusable.
+
+    A row is skipped when its date or manager cell is empty or its date cannot
+    be parsed. When the version column is absent or empty, the version is
+    derived from the date. ``IndexError``/``ValueError`` raised while reading
+    the row are reported and also result in ``None``.
+    """
+
+    def cell_text(index: int | None, missing: str | None) -> str | None:
+        # Text of the cell at ``index``, or ``missing`` when the column is absent.
+        return missing if index is None else cells[index].get_text(strip=True)
+
+    try:
+        date_str = cell_text(date_idx, "")
+        release_manager = cell_text(manager_idx, "")
+        version = cell_text(version_idx, None)
+
+        if not (date_str and release_manager):
+            console.print(f"  [dim]Row {row_num}: Skipping empty row[/dim]")
+            return None
+
+        date = parse_date_string(date_str)
+        if not date:
+            return None
+
+        manager_first_name = extract_manager_first_name(release_manager)
+        effective_version = version or generate_version_from_date(date)
+
+        release = Release(
+            release_type=release_type,
+            version=effective_version,
+            date=date,
+            release_manager=manager_first_name,
+        )
+        console.print(f"  [green]Parsed:[/green] {release}")
+        return release
+
+    except (IndexError, ValueError) as error:
+        console.print(f"[yellow]Error parsing row {row_num}:[/yellow] {error}")
+        return None
+
+
+def parse_table(table: Any, release_type: str) -> list[Release]:
+    """Parse every data row of a release table into ``Release`` objects.
+
+    The first row is treated as the header. An empty list is returned when the
+    table has no data rows or lacks a date or manager column. Rows with too few
+    cells, and rows ``parse_table_row`` rejects, are skipped.
+    """
+    rows = table.find_all("tr")
+    console.print(f"  [dim]Table has {len(rows)} rows[/dim]")
+
+    headers, is_valid = get_table_headers(rows)
+    if not is_valid:
+        return []
+
+    version_idx, date_idx, manager_idx = find_column_indices(headers)
+    if date_idx is None or manager_idx is None:
+        console.print("  [yellow]Could not find required columns (date and manager)[/yellow]")
+        return []
+
+    # Computed once for all rows. Filtering on ``is not None`` (rather than on
+    # truthiness) keeps a legitimate column index of 0 in the calculation.
+    required_cells = max(idx for idx in (version_idx, date_idx, manager_idx) if idx is not None) + 1
+
+    releases: list[Release] = []
+    for row_num, row in enumerate(rows[1:], start=1):
+        cells = row.find_all(["td", "th"])
+        if len(cells) < required_cells:
+            console.print(f"  [dim]Row {row_num}: Skipping (not enough cells)[/dim]")
+            continue
+
+        release = parse_table_row(cells, row_num, version_idx, date_idx, manager_idx, release_type)
+        if release:
+            releases.append(release)
+
+    return releases
+
+
+def parse_calendar_component(component: Any) -> CalendarEntry | None:
+    """Convert a calendar component into a ``CalendarEntry``.
+
+    Returns ``None`` for anything that is not a ``VEVENT`` or has no ``dtstart``.
+    The start is reduced to midnight (naive) of its calendar day, whether the
+    event carries a ``date`` or a ``datetime``; a ``datetime`` contributes the
+    day it has as stored, without any time-zone conversion.
+    """
+    if component.name != "VEVENT":
+        return None
+
+    dtstart = component.get("dtstart")
+    if not dtstart:
+        return None
+
+    summary = str(component.get("summary", ""))
+    description = component.get("description", "")
+
+    # ``datetime`` is checked explicitly; a plain ``date`` is used as-is.
+    start = dtstart.dt
+    start_day = start.date() if isinstance(start, datetime) else start
+
+    return CalendarEntry(
+        summary=summary,
+        start_date=datetime.combine(start_day, datetime.min.time()),
+        description=str(description) if description else None,
+    )
+
+
+def fetch_calendar_entries() -> list[CalendarEntry]:
+    """Download the iCal feed and return its events as ``CalendarEntry`` objects.
+
+    Any ``requests.RequestException`` during the download is printed and ends
+    the script with exit code 1. Components that do not convert to an entry
+    are left out.
+    """
+    console.print(f"[cyan]Fetching calendar:[/cyan] {CALENDAR_ICAL_URL}")
+    try:
+        feed = requests.get(CALENDAR_ICAL_URL, timeout=30)
+        feed.raise_for_status()
+        raw_ical = feed.content
+    except requests.RequestException as error:
+        console.print(f"[red]Failed to fetch calendar:[/red] {error}")
+        sys.exit(1)
+
+    console.print("[cyan]Parsing calendar entries...[/cyan]")
+    converted = (parse_calendar_component(part) for part in Calendar.from_ical(raw_ical).walk())
+    entries = [entry for entry in converted if entry]
+
+    console.print(f"[green]Found {len(entries)} calendar entries[/green]")
+    return entries
+
+
+def normalize_name(name: str) -> str:
+    """Return ``name`` lower-cased, stripped, and without combining accent marks."""
+    import unicodedata
+
+    # NFD splits accented letters into a base letter plus combining marks ...
+    decomposed = unicodedata.normalize("NFD", name)
+    # ... and dropping category "Mn" (nonspacing marks) removes the accents.
+    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
+    return "".join(base_chars).lower().strip()
+
+
+def dates_match(release_date: datetime, entry_date: datetime) -> bool:
+    """Return ``True`` when both values fall on the same calendar day (time is ignored)."""
+    release_day = (release_date.year, release_date.month, release_date.day)
+    entry_day = (entry_date.year, entry_date.month, entry_date.day)
+    return entry_day == release_day
+
+
+def check_release_type_match(release_type: str, summary: str) -> bool:
+    """Return ``True`` when ``summary`` mentions ``release_type``.
+
+    Both texts are compared after ``normalize_name`` (case- and
+    accent-insensitive). For a release type containing both "airflow" and
+    "ctl", it is enough that the summary contains both words anywhere.
+    """
+    summary_norm = normalize_name(summary)
+    type_norm = normalize_name(release_type)
+
+    if type_norm in summary_norm:
+        return True
+
+    # Handle "Airflow Ctl" vs "Airflow CTL" variations
+    ctl_words = ("airflow", "ctl")
+    if all(word in type_norm for word in ctl_words):
+        return all(word in summary_norm for word in ctl_words)
+
+    return False
+
+
+def check_version_match(version: str, summary: str) -> bool:
+    """Return ``True`` when ``version`` appears in ``summary`` as a whole version.
+
+    The match must not be part of a longer number: "3.0.1" does not match
+    "3.0.10", "13.0.1" or "3.0.1.2". Non-numeric neighbours are allowed, so
+    "v3.0.1" and "3.0.1rc1" still match. An empty ``version`` is treated as
+    trivially present, as a plain substring test would.
+    """
+    import re
+
+    if not version:
+        return True
+
+    # Reject a digit or dot right before the version, and a digit (optionally
+    # preceded by a dot) right after it.
+    version_pattern = r"(?<![\d.])" + re.escape(version) + r"(?!\.?\d)"
+    return re.search(version_pattern, summary) is not None
+
+
+def check_manager_match(manager_name: str, summary: str) -> bool:
+    """Return ``True`` when the manager's name appears in ``summary`` as a whole word.
+
+    Both texts are compared after ``normalize_name`` (case- and
+    accent-insensitive). The name must not be embedded in a longer word, so
+    "Jed" does not match "Jedrzej". An empty name never matches.
+    """
+    import re
+
+    normalized_manager = normalize_name(manager_name)
+    if not normalized_manager:
+        # An empty name would otherwise "match" every summary.
+        return False
+    normalized_summary = normalize_name(summary)
+
+    # Lookarounds instead of \b so that names ending in punctuation still match.
+    manager_pattern = r"(?<!\w)" + re.escape(normalized_manager) + r"(?!\w)"
+    return re.search(manager_pattern, normalized_summary) is not None
+
+
+def is_matching_entry(release: Release, entry: CalendarEntry) -> bool:
+    """
+    Decide whether ``entry`` is the calendar event for ``release``.
+
+    All of the following must hold:
+    - the release date and the entry start fall on the same day
+    - the entry summary names the release manager
+    - the entry summary names the release type OR the version
+    """
+    if not dates_match(release.date, entry.start_date):
+        return False
+
+    if not check_manager_match(release.release_manager, entry.summary):
+        return False
+
+    return check_release_type_match(release.release_type, entry.summary) or check_version_match(
+        release.version, entry.summary
+    )
+
+
+def find_matching_entry(release: Release, calendar_entries: list[CalendarEntry]) -> CalendarEntry | None:
+    """Return the first entry in ``calendar_entries`` that matches ``release``, else ``None``."""
+    matches = (entry for entry in calendar_entries if is_matching_entry(release, entry))
+    return next(matches, None)
+
+
+def print_verification_header() -> None:
+    """Print the banner that introduces the per-release verification output."""
+    rule = "=" * 80
+    console.print("\n" + rule)
+    console.print("[bold cyan]VERIFICATION RESULTS[/bold cyan]")
+    console.print(rule + "\n")
+
+
+def print_matched_release(release: Release, entry: CalendarEntry) -> None:
+    """Print a matched release followed by the summary of its calendar entry."""
+    matched_line = "[green]✓ MATCHED:[/green] " + str(release)
+    calendar_line = "  [dim]Calendar: " + str(entry.summary) + "[/dim]"
+    console.print(matched_line)
+    console.print(calendar_line)
+
+
+def print_unmatched_release(release: Release) -> None:
+    """Print a release for which no calendar entry was found."""
+    unmatched_line = "[red]✗ NOT MATCHED:[/red] " + str(release)
+    console.print(unmatched_line)
+
+
+def print_verification_summary(
+    total_releases: int, matched_count: int, unmatched_releases: list[Release]
+) -> None:
+    """Print the totals and, if there are any, the list of unmatched releases."""
+    rule = "=" * 80
+    unmatched_count = len(unmatched_releases)
+
+    console.print("\n" + rule)
+    console.print("[bold cyan]SUMMARY[/bold cyan]")
+    console.print(rule)
+    console.print(f"Total releases in Confluence: {total_releases}")
+    console.print(f"Matched releases: [green]{matched_count}[/green]")
+    console.print(f"Unmatched releases: [red]{unmatched_count}[/red]")
+
+    if unmatched_count:
+        console.print("\n[yellow]Unmatched releases:[/yellow]")
+        for missing in unmatched_releases:
+            console.print(f"  [yellow]•[/yellow] {missing}")
+
+    console.print(rule + "\n")
+
+
+def verify_releases(releases: list[Release], calendar_entries: list[CalendarEntry]) -> bool:
+    """Check every release against the calendar and print the results.
+
+    Returns ``True`` only when each release has a matching calendar entry
+    (which is also the case for an empty ``releases`` list).
+    """
+    print_verification_header()
+
+    missing: list[Release] = []
+    matched_count = 0
+
+    for release in releases:
+        entry = find_matching_entry(release, calendar_entries)
+        if not entry:
+            missing.append(release)
+            print_unmatched_release(release)
+            continue
+        print_matched_release(release, entry)
+        matched_count += 1
+
+    print_verification_summary(len(releases), matched_count, missing)
+
+    return not missing
+
+
+def load_html_content(args: argparse.Namespace) -> str:
+    """Return the release plan HTML, from a local file or freshly downloaded.
+
+    With ``args.load_html`` set, the file is read and nothing is fetched or
+    saved. Otherwise the page is fetched and, if ``args.save_html`` is set,
+    also written to that file.
+    """
+    source_file = args.load_html
+    if source_file:
+        console.print(f"[cyan]Loading HTML from file:[/cyan] {source_file}")
+        return Path(source_file).read_text(encoding="utf-8")
+
+    html_content = fetch_confluence_page()
+    target_file = args.save_html
+    if target_file:
+        console.print(f"[cyan]Saving HTML to file:[/cyan] {target_file}")
+        Path(target_file).write_text(html_content, encoding="utf-8")
+    return html_content
+
+
+def validate_releases(releases: list[Release]) -> None:
+    """Exit with code 1 (after printing an error) when ``releases`` is empty."""
+    if releases:
+        return
+    console.print("[red]No releases found in Confluence page![/red]")
+    sys.exit(1)
+
+
+def validate_calendar_entries(calendar_entries: list[CalendarEntry]) -> None:
+    """Exit with code 1 (after printing an error) when ``calendar_entries`` is empty."""
+    if calendar_entries:
+        return
+    console.print("[red]No calendar entries found![/red]")
+    sys.exit(1)
+
+
+def print_final_result(all_matched: bool) -> None:
+    """Print the overall verdict and exit: code 0 on full match, 1 otherwise."""
+    if not all_matched:
+        console.print("[bold red]✗ Some releases do not have matching calendar entries![/bold red]")
+        sys.exit(1)
+
+    console.print("[bold green]✓ All releases have matching calendar entries![/bold green]")
+    sys.exit(0)
+
+
+def _build_argument_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser with the HTML save/load debugging options."""
+    parser = argparse.ArgumentParser(
+        description="Verify that planned releases in Confluence match Google Calendar entries"
+    )
+    parser.add_argument(
+        "--save-html", metavar="FILE", help="Save the fetched Confluence HTML to a file for debugging"
+    )
+    parser.add_argument(
+        "--load-html", metavar="FILE", help="Load Confluence HTML from a file instead of fetching"
+    )
+    return parser
+
+
+def main():
+    """Run the check: load the release plan, fetch the calendar, compare, and exit.
+
+    The script exits with code 1 when no releases or no calendar entries are
+    found, or when any release lacks a matching entry; otherwise with code 0.
+    """
+    args = _build_argument_parser().parse_args()
+
+    # The release plan is validated before the calendar is fetched.
+    releases = parse_confluence_releases(load_html_content(args))
+    validate_releases(releases)
+
+    calendar_entries = fetch_calendar_entries()
+    validate_calendar_entries(calendar_entries)
+
+    # print_final_result() terminates the process with the final exit code.
+    print_final_result(verify_releases(releases, calendar_entries))
+
+# Run the verification only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

(airflow) branch main updated: Add script that verifies consistency: Release Page with Release Calendar (#59142)

Reply via email to