This is an automated email from the ASF dual-hosted git repository.
potiuk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new a8d9feb6683 Add rookie author filtering and improved GitHub token handling (#60641)
a8d9feb6683 is described below
commit a8d9feb66837a6ff1eefb3dfe6922aa2af88ef63
Author: Jarek Potiuk <[email protected]>
AuthorDate: Sat Jan 17 10:52:48 2026 +0100
Add rookie author filtering and improved GitHub token handling (#60641)
- Introduced filtering for PRs by rookie authors (less than 5 PRs, first PR within 2 months).
- Enhanced GitHub token handling by supporting retrieval via `gh` CLI and environment variable fallback.
- Added detailed author display with rookie indicators.
- Implemented backend optimizations for author checks using caching and GraphQL.
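
For quick reference, the token resolution added here falls back in order: the --github-token option, then the GITHUB_TOKEN environment variable, then `gh auth token`. A sketch of a possible invocation follows (the --rookie and --show-score flags come from this commit; the date option names are assumed to mirror main()'s date_start/date_end parameters, and the inline PEP 723 metadata block added by this commit should let `uv run` resolve the dependencies):

    # One-time setup if GITHUB_TOKEN is not exported; the script then
    # obtains the token via `gh auth token`.
    gh auth login

    # Hypothetical invocation; --date-start/--date-end names are assumed.
    uv run dev/stats/get_important_pr_candidates.py \
        --date-start 2026-01-01 --date-end 2026-01-31 \
        --rookie --show-score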
---
dev/stats/get_important_pr_candidates.py | 380 ++++++++++++++++++++++++++++---
1 file changed, 348 insertions(+), 32 deletions(-)
diff --git a/dev/stats/get_important_pr_candidates.py b/dev/stats/get_important_pr_candidates.py
index b85b27a3bc6..eb0fce198c1 100755
--- a/dev/stats/get_important_pr_candidates.py
+++ b/dev/stats/get_important_pr_candidates.py
@@ -15,6 +15,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+# /// script
+# requires-python = ">=3.9"
+# dependencies = [
+# "pendulum>=3.0.0",
+# "requests>=2.31.0",
+# "rich-click>=1.7.0",
+# "PyGithub>=2.1.1",
+# "rich>=13.7.0",
+# ]
+# ///
from __future__ import annotations
import heapq
@@ -24,6 +34,7 @@ import math
import os
import pickle
import re
+import subprocess
import textwrap
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
@@ -38,17 +49,58 @@ from rich.console import Console
logger = logging.getLogger(__name__)
console = Console(width=400, color_system="standard")
+
+def get_github_token_from_gh_cli() -> str | None:
+ """Retrieve GitHub token from gh CLI tool."""
+ try:
+ result = subprocess.run(
+ ["gh", "auth", "token"],
+ capture_output=True,
+ text=True,
+ check=True,
+ timeout=5,
+ )
+ token = result.stdout.strip()
+ if token:
+ return token
+ except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
+ pass
+ return None
+
+
+def get_github_token(token: str | None) -> str:
+ """Get GitHub token from parameter, env var, or gh CLI."""
+ if token:
+ return token
+
+ # Try environment variable
+ env_token = os.environ.get("GITHUB_TOKEN")
+ if env_token:
+ return env_token
+
+ # Try gh CLI
+ gh_token = get_github_token_from_gh_cli()
+ if gh_token:
+ console.print("[blue]Using GitHub token from gh CLI[/]")
+ return gh_token
+
+ raise click.ClickException(
+ "GitHub token is required. Provide via --github-token, GITHUB_TOKEN
env variable, "
+ "or authenticate with 'gh auth login'"
+ )
+
+
option_github_token = click.option(
"--github-token",
type=str,
- required=True,
+ required=False,
help=textwrap.dedent(
"""
- A GitHub token is required, and can also be provided by setting the GITHUB_TOKEN env variable.
+ A GitHub token is required, and can also be provided by setting the GITHUB_TOKEN env variable
+ or retrieved automatically from 'gh' CLI if authenticated.
Can be generated with:
https://github.com/settings/tokens/new?description=Read%20issues&scopes=repo:status"""
),
- envvar="GITHUB_TOKEN",
)
@@ -61,11 +113,11 @@ class PRFetcher:
}
self.base_url = "https://api.github.com/graphql"
- def fetch_prs_bulk(self, pr_numbers: list[int]) -> list[dict]:
+ def fetch_prs_bulk(self, pr_numbers: list[int], batch_size: int = 10) -> list[dict]:
"""Fetch multiple PRs with COMPLETE data and proper pagination."""
pr_queries = []
- for i, pr_num in enumerate(pr_numbers[:10]):
+ for i, pr_num in enumerate(pr_numbers[:batch_size]):
pr_queries.append(f"""
pr{i}: pullRequest(number: {pr_num}) {{
number
@@ -74,7 +126,12 @@ class PRFetcher:
createdAt
mergedAt
url
- author {{ login }}
+ author {{
+ login
+ ... on User {{
+ name
+ }}
+ }}
additions
deletions
changedFiles
@@ -176,7 +233,38 @@ class PRFetcher:
data = response.json()
if "errors" in data:
- logger.error("GraphQL errors: %s", {data["errors"]})
+ errors = data["errors"]
+
+ # Check for RESOURCE_LIMIT(S)_EXCEEDED error - handle silently (before logging)
+ for error in errors:
+ error_type = error.get("type", "")
+ error_message = error.get("message", "")
+
+ # Check for both singular and plural variants
+ if (
+ error_type in ("RESOURCE_LIMIT_EXCEEDED", "RESOURCE_LIMITS_EXCEEDED")
+ or "RESOURCE_LIMIT" in error_type
+ or "RESOURCE_LIMIT" in error_message
+ ):
+ # Resource limit exceeded - retry with smaller batch (silently)
+ if batch_size > 1:
+ new_batch_size = max(1, batch_size // 2)
+ console.print(
+ f"[yellow]⚠ GraphQL resource limit exceeded - "
+ f"reducing batch size from {batch_size} to
{new_batch_size}[/]"
+ )
+ # Recursively retry with smaller batch
+ return self.fetch_prs_bulk(pr_numbers, batch_size=new_batch_size)
+ console.print(
+ f"[red]❌ Resource limit exceeded even with batch
size 1, "
+ f"skipping PR {pr_numbers[0] if pr_numbers else
'unknown'}[/]"
+ )
+ return []
+
+ # Only log non-resource-limit errors
+ # (if we got here, it's not a resource limit error)
+ logger.error("GraphQL errors: %s", errors)
+
if "data" not in data:
return []
@@ -189,7 +277,7 @@ class PRFetcher:
return prs
except Exception as e:
- logger.error("GraphQL request exception: %s", {e})
+ logger.error("GraphQL request exception: %s", e)
return []
def fetch_linked_issues(self, pr_body: str, github_client: Github) -> dict:
@@ -250,6 +338,7 @@ class PrStat:
self.body = pr_data.get("body", "") or ""
self.url = pr_data["url"]
self.author = pr_data["author"]["login"] if pr_data.get("author") else "unknown"
+ self.author_name = pr_data.get("author", {}).get("name") if pr_data.get("author") else None
self.merged_at = pr_data.get("mergedAt")
self.created_at = pr_data.get("createdAt")
@@ -278,8 +367,9 @@ class PrStat:
comments_nodes = comments_data.get("nodes", [])
for comment in comments_nodes:
- if comment.get("author", {}).get("login"):
- self._users.add(comment["author"]["login"])
+ author = comment.get("author") or {}
+ if author.get("login"):
+ self._users.add(author["login"])
comment_body = comment.get("body", "") or ""
if "protm" in comment_body.lower():
@@ -288,11 +378,12 @@ class PrStat:
self.num_comments += 1
self.len_comments += len(comment_body)
- reactions = comment.get("reactions", {})
+ reactions = comment.get("reactions", {}) or {}
reaction_nodes = reactions.get("nodes", [])
for reaction in reaction_nodes:
- if reaction.get("user", {}).get("login"):
- self._users.add(reaction["user"]["login"])
+ user = reaction.get("user") or {}
+ if user.get("login"):
+ self._users.add(user["login"])
self.comment_reactions += 1
def calc_conv_comments(self):
@@ -301,8 +392,9 @@ class PrStat:
timeline_nodes = timeline_data.get("nodes", [])
for item in timeline_nodes:
- if item.get("author", {}).get("login"):
- self._users.add(item["author"]["login"])
+ author = item.get("author") or {}
+ if author.get("login"):
+ self._users.add(author["login"])
comment_body = item.get("body", "") or ""
if "protm" in comment_body.lower():
@@ -311,11 +403,12 @@ class PrStat:
self.num_conv_comments += 1
self.len_issue_comments += len(comment_body)
- reactions = item.get("reactions", {})
+ reactions = item.get("reactions", {}) or {}
reaction_nodes = reactions.get("nodes", [])
for reaction in reaction_nodes:
- if reaction.get("user", {}).get("login"):
- self._users.add(reaction["user"]["login"])
+ user = reaction.get("user") or {}
+ if user.get("login"):
+ self._users.add(user["login"])
self.conv_comment_reactions += 1
def calc_review_comments(self):
@@ -328,8 +421,9 @@ class PrStat:
comment_nodes = comments.get("nodes", [])
for comment in comment_nodes:
- if comment.get("author", {}).get("login"):
- self._users.add(comment["author"]["login"])
+ author = comment.get("author") or {}
+ if author.get("login"):
+ self._users.add(author["login"])
comment_body = comment.get("body", "") or ""
if "protm" in comment_body.lower():
@@ -337,11 +431,12 @@ class PrStat:
self.len_comments += len(comment_body)
- reactions = comment.get("reactions", {})
+ reactions = comment.get("reactions", {}) or {}
reaction_nodes = reactions.get("nodes", [])
for reaction in reaction_nodes:
- if reaction.get("user", {}).get("login"):
- self._users.add(reaction["user"]["login"])
+ user = reaction.get("user") or {}
+ if user.get("login"):
+ self._users.add(user["login"])
self.comment_reactions += 1
def calc_reviews(self):
@@ -350,8 +445,9 @@ class PrStat:
review_nodes = reviews_data.get("nodes", [])
for review in review_nodes:
- if review.get("author", {}).get("login"):
- self._users.add(review["author"]["login"])
+ author = review.get("author") or {}
+ if author.get("login"):
+ self._users.add(author["login"])
review_body = review.get("body", "") or ""
if "protm" in review_body.lower():
@@ -363,8 +459,9 @@ class PrStat:
reaction_nodes = reactions_data.get("nodes", [])
for reaction in reaction_nodes:
- if reaction.get("user", {}).get("login"):
- self._users.add(reaction["user"]["login"])
+ user = reaction.get("user") or {}
+ if user.get("login"):
+ self._users.add(user["login"])
def calc_issue_reactions(self):
"""Process linked issue data."""
@@ -501,6 +598,12 @@ class PrStat:
)
return self._score
+ def get_author_display(self) -> str:
+ """Format author information as 'by @github_id (Name)' or just 'by
@github_id' if name not available."""
+ if self.author_name:
+ return f"by @{self.author} ({self.author_name})"
+ return f"by @{self.author}"
+
def __str__(self) -> str:
self.process_all_data()
prefix = "[magenta]##Tagged PR## [/]" if self.tagged_protm else ""
@@ -518,6 +621,137 @@ class SuperFastPRFinder:
self.github_token = github_token
self.github_client = Github(github_token)
self.graphql_fetcher = PRFetcher(github_token)
+ self.author_cache: dict[str, dict] = {}
+
+ def is_rookie_author(self, author: str) -> bool:
+ """Check if an author is a rookie (less than 5 PRs, first PR within 2
months)."""
+ if author in self.author_cache:
+ return self.author_cache[author]["is_rookie"]
+
+ # Skip bot accounts - they're not rookies
+ bot_accounts = [
+ "dependabot",
+ "dependabot[bot]",
+ "github-actions",
+ "github-actions[bot]",
+ "pre-commit-ci",
+ "pre-commit-ci[bot]",
+ ]
+ if author.lower() in [b.lower() for b in bot_accounts] or author.endswith("[bot]"):
+ self.author_cache[author] = {"is_rookie": False, "total_prs": -1, "reason": "bot account"}
+ return False
+
+ try:
+ # Use GraphQL to fetch author's PR count and first PR date - more efficient
+ query = """
+ query($author: String!) {
+ search(query: $author, type: ISSUE, first: 5) {
+ issueCount
+ edges {
+ node {
+ ... on PullRequest {
+ number
+ mergedAt
+ closedAt
+ }
+ }
+ }
+ }
+ }
+ """
+
+ search_query = f"repo:apache/airflow type:pr author:{author} is:merged sort:created-asc"
+ variables = {"author": search_query}
+
+ response = requests.post(
+ "https://api.github.com/graphql",
+ json={"query": query, "variables": variables},
+ headers=self.graphql_fetcher.headers,
+ timeout=10,
+ )
+
+ if response.status_code == 403:
+ # Rate limit or auth issue - mark as non-rookie to continue
+ console.print(f"[dim yellow]⚠ API rate limit for {author},
skipping rookie check[/]")
+ self.author_cache[author] = {"is_rookie": False, "error":
"rate_limit"}
+ return False
+
+ if response.status_code != 200:
+ console.print(f"[dim yellow]⚠ API error {response.status_code}
for {author}[/]")
+ self.author_cache[author] = {"is_rookie": False, "error":
f"http_{response.status_code}"}
+ return False
+
+ data = response.json()
+
+ if "errors" in data:
+ errors = data["errors"]
+
+ # Check for RESOURCE_LIMIT(S)_EXCEEDED error - both singular and plural
+ for error in errors:
+ error_type = error.get("type", "")
+ error_message = error.get("message", "")
+
+ if (
+ error_type in ("RESOURCE_LIMIT_EXCEEDED", "RESOURCE_LIMITS_EXCEEDED")
+ or "RESOURCE_LIMIT" in error_type
+ or "RESOURCE_LIMIT" in error_message
+ ):
+ console.print(
+ f"[dim yellow]⚠ Resource limit exceeded for author
{author}, "
+ f"marking as non-rookie to continue[/]"
+ )
+ self.author_cache[author] = {"is_rookie": False, "error": "resource_limit"}
+ return False
+
+ # Other GraphQL errors
+ self.author_cache[author] = {"is_rookie": False, "error": "graphql_error"}
+ return False
+
+ search_data = data.get("data", {}).get("search", {})
+ total_prs = search_data.get("issueCount", 0)
+
+ if total_prs >= 5:
+ self.author_cache[author] = {"is_rookie": False, "total_prs": total_prs}
+ return False
+
+ # Get the first PR
+ edges = search_data.get("edges", [])
+ if not edges:
+ self.author_cache[author] = {"is_rookie": False, "total_prs": 0}
+ return False
+
+ first_pr_node = edges[0].get("node", {})
+ first_pr_merged_str = first_pr_node.get("mergedAt") or first_pr_node.get("closedAt")
+
+ if not first_pr_merged_str:
+ self.author_cache[author] = {"is_rookie": False, "total_prs": total_prs}
+ return False
+
+ # Parse ISO datetime string
+ parsed_date = pendulum.parse(first_pr_merged_str)
+ # Ensure we have a DateTime instance for comparison
+ if not isinstance(parsed_date, pendulum.DateTime):
+ self.author_cache[author] = {"is_rookie": False, "total_prs": total_prs}
+ return False
+
+ first_pr_merged = parsed_date
+ two_months_ago = pendulum.now().subtract(months=2)
+
+ is_rookie = first_pr_merged >= two_months_ago
+ self.author_cache[author] = {
+ "is_rookie": is_rookie,
+ "total_prs": total_prs,
+ "first_pr_date": first_pr_merged,
+ }
+ return is_rookie
+
+ except Exception as e:
+ # Silently handle errors - don't clutter output during bulk processing
+ error_msg = str(e)
+ if "403" not in error_msg and "Forbidden" not in error_msg and
"rate" not in error_msg.lower():
+ console.print(f"[dim yellow]⚠ Error checking {author}: {e}[/]")
+ self.author_cache[author] = {"is_rookie": False, "error": str(e)}
+ return False
def search_prs_with_filters(
self, date_start: datetime, date_end: datetime, limit: int = 1000
@@ -681,6 +915,33 @@ class SuperFastPRFinder:
console.print(f"[blue]Successfully processed {len(pr_stats)} PRs with
complete data[/]")
return pr_stats
+ def filter_rookie_prs(self, pr_stats: list[PrStat], max_workers: int = 4) -> list[PrStat]:
+ """Filter PRs to only include those from rookie authors."""
+ console.print("[blue]🆕 Filtering for rookie authors...[/]")
+
+ rookie_prs = []
+
+ # Check authors in parallel for better performance
+ def check_pr_author(pr_stat: PrStat) -> tuple[PrStat, bool]:
+ is_rookie = self.is_rookie_author(pr_stat.author)
+ return (pr_stat, is_rookie)
+
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
+ futures = [executor.submit(check_pr_author, pr) for pr in pr_stats]
+
+ for future in as_completed(futures):
+ pr_stat, is_rookie = future.result()
+ if is_rookie:
+ rookie_prs.append(pr_stat)
+ author_info = self.author_cache.get(pr_stat.author, {})
+ console.print(
+ f"[green]✓ Rookie: @{pr_stat.author} -
{author_info.get('total_prs', 0)} PRs, "
+ f"first PR: {author_info.get('first_pr_date',
'unknown')}[/]"
+ )
+
+ console.print(f"[green]Found {len(rookie_prs)} PRs from rookie authors
(out of {len(pr_stats)})[/]")
+ return rookie_prs
+
DAYS_BACK = 5
DEFAULT_BEGINNING_OF_MONTH = pendulum.now().subtract(days=DAYS_BACK).start_of("month")
@@ -704,6 +965,10 @@ DEFAULT_END_OF_MONTH = DEFAULT_BEGINNING_OF_MONTH.end_of("month").add(days=1)
@click.option("--verbose", is_flag=True, help="Print detailed output")
@click.option("--cache-search", type=click.Path(), help="Cache search results
to file")
@click.option("--load-search", type=click.Path(), help="Load search results
from cache")
[email protected](
+ "--rookie", is_flag=True, help="Only consider PRs from rookie authors (<5
PRs, first PR within 2 months)"
+)
[email protected]("--show-score", is_flag=True, help="Include score in the output")
def main(
github_token: str,
date_start: datetime,
@@ -717,15 +982,29 @@ def main(
verbose: bool,
cache_search: str | None,
load_search: str | None,
+ rookie: bool,
+ show_score: bool,
):
"""Super-fast PR finder with COMPLETE data capture and proper GraphQL
pagination."""
+ github_token = get_github_token(github_token)
+
console.print("[bold blue]🚀 Fixed Super-Fast PR Candidate Finder[/bold
blue]")
console.print(f"Date range: {date_start.date()} to {date_end.date()}")
+ if rookie:
+ console.print("[bold cyan]🆕 Rookie mode: Only considering PRs from new
contributors[/bold cyan]")
+
+ # Initialize finder - needed for rookie mode or for general use
+ finder = None
if load:
console.print("[yellow]Loading from cache...[/]")
pr_stats = pickle.load(load)
+
+ if rookie:
+ finder = SuperFastPRFinder(github_token)
+ pr_stats = finder.filter_rookie_prs(pr_stats, max_workers)
+
scores = {pr.number: pr.score for pr in pr_stats}
else:
@@ -755,9 +1034,14 @@ def main(
console.print("[blue]🔥 Phase 3: Complete data fetching with proper
pagination[/]")
pr_stats = finder.fetch_full_pr_data(candidate_numbers, max_workers)
+ if rookie:
+ pr_stats = finder.filter_rookie_prs(pr_stats, max_workers)
+
scores = {pr.number: pr.score for pr in pr_stats}
- console.print(f"\n[bold green]🏆 Top {top_number} PRs:[/bold green]")
+ # Format date range for display
+ date_range_str = f"{date_start.strftime('%Y-%m-%d')} to {date_end.strftime('%Y-%m-%d')}"
+ console.print(f"\n[bold green]🏆 Top {top_number} PRs ({date_range_str}):[/bold green]\n")
top_final = heapq.nlargest(top_number, scores.items(), key=lambda x: x[1])
for i, (pr_num, score) in enumerate(top_final, 1):
@@ -765,18 +1049,50 @@ def main(
if pr_stat:
pr_stat.process_all_data()
protm_indicator = "🔥" if pr_stat.tagged_protm else ""
- console.print(
- f"[green]{i:2d}. {protm_indicator} Score: {score:.2f} -
PR#{pr_num}: {pr_stat.title}[/]"
- )
+
+ # Build the main PR line
+ if show_score:
+ console.print(
+ f"[green]{i:2d}. {protm_indicator} Score: {score:.2f} -
PR#{pr_num}: {pr_stat.title}[/]"
+ )
+ else:
+ console.print(f"[green]{i:2d}. {protm_indicator} PR#{pr_num}:
{pr_stat.title}[/]")
+
+ # Show author information
+ author_display = pr_stat.get_author_display()
+
+ # Add rookie information if in rookie mode and we have the data
+ if rookie and finder is not None and pr_stat.author in finder.author_cache:
+ author_info = finder.author_cache[pr_stat.author]
+ if author_info.get("is_rookie"):
+ total_prs = author_info.get("total_prs", 0)
+ first_pr_date = author_info.get("first_pr_date")
+ if first_pr_date:
+ # Format the date nicely
+ if isinstance(first_pr_date, str):
+ date_str = first_pr_date
+ else:
+ date_str = (
+ first_pr_date.strftime("%Y-%m-%d")
+ if hasattr(first_pr_date, "strftime")
+ else str(first_pr_date)
+ )
+ author_display += f" [cyan]🆕 Rookie: PR #{total_prs}, first merged {date_str}[/cyan]"
+
+ console.print(f" {author_display}")
console.print(f" [dim]{pr_stat.url}[/dim]")
+
if verbose:
console.print(
- f" [dim]Author: {pr_stat.author}, Files:
{pr_stat.changed_files}, "
+ f" [dim]Files: {pr_stat.changed_files}, "
f"+{pr_stat.additions}/-{pr_stat.deletions}, Comments:
{pr_stat.num_comments + pr_stat.num_conv_comments}[/dim]"
)
if pr_stat.tagged_protm:
console.print(" [magenta]🔥 CONTAINS #PROTM
TAG[/magenta]")
+ # Add empty line between PRs for better readability
+ console.print()
+
if save:
console.print("[blue]💾 Saving complete results...[/]")
pickle.dump(pr_stats, save)
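
Stripped of caching, batching, and error handling, the new rookie check reduces to a single GraphQL search plus a pendulum date comparison. A minimal self-contained sketch of the same technique (the is_rookie helper name and the example login are illustrative only, not part of the commit):

    import os

    import pendulum
    import requests

    QUERY = """
    query($q: String!) {
      search(query: $q, type: ISSUE, first: 1) {
        issueCount
        edges { node { ... on PullRequest { mergedAt } } }
      }
    }
    """

    def is_rookie(author: str, token: str) -> bool:
        # Same search string the commit builds: merged PRs by this author
        # in apache/airflow, oldest first.
        q = f"repo:apache/airflow type:pr author:{author} is:merged sort:created-asc"
        response = requests.post(
            "https://api.github.com/graphql",
            json={"query": QUERY, "variables": {"q": q}},
            headers={"Authorization": f"Bearer {token}"},
            timeout=10,
        )
        search = response.json()["data"]["search"]
        # Rookie criteria from the commit: fewer than 5 merged PRs, and the
        # first of them merged within the last two months.
        if search["issueCount"] >= 5 or not search["edges"]:
            return False
        first_merged = pendulum.parse(search["edges"][0]["node"]["mergedAt"])
        return first_merged >= pendulum.now().subtract(months=2)

    print(is_rookie("some-login", os.environ["GITHUB_TOKEN"]))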