[airflow] branch main updated: improve performance of pr script in dev/stats (#32735)

mobuchowski Mon, 24 Jul 2023 06:01:14 -0700

This is an automated email from the ASF dual-hosted git repository.

mobuchowski pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git



The following commit(s) were added to refs/heads/main by this push:
     new 6f75a8a917 improve performance of pr script in dev/stats (#32735)
6f75a8a917 is described below

commit 6f75a8a9172725f63b14b83b4b57c4a10a4286c8
Author: Michael Robinson <[email protected]>
AuthorDate: Mon Jul 24 09:00:57 2023 -0400

    improve performance of pr script in dev/stats (#32735)
    
    * add error handling in case of 404 errors to protm script
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * add error handling in case of missing score when using pickle file
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * reformat
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * add UnknownObjectException class to error handling
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * fix github imports
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * fix interaction_score
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * remove attribute error handling
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * remove second attribute error handling block
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * change api endpoint for pull requests in pr script in stats
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * reformat
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * Update dev/stats/get_important_pr_candidates.py
    
    Co-authored-by: Wei Lee <[email protected]>
    
    * reformat nested loop and add new line
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * consolidate functions to decrease number of api calls
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * add rate limit parameter and limit info to output
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * refactor after linting
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    * refactor after linting contd
    
    Signed-off-by: Michael Robinson <[email protected]>
    
    ---------
    
    Signed-off-by: Michael Robinson <[email protected]>
    Co-authored-by: Wei Lee <[email protected]>
---
 dev/stats/get_important_pr_candidates.py | 228 ++++++++++++++-----------------
 1 file changed, 100 insertions(+), 128 deletions(-)

diff --git a/dev/stats/get_important_pr_candidates.py b/dev/stats/get_important_pr_candidates.py
index 3aed16f13f..738eed53c2 100755
--- a/dev/stats/get_important_pr_candidates.py
+++ b/dev/stats/get_important_pr_candidates.py
@@ -33,7 +33,6 @@ from rich.console import Console
 
 logger = logging.getLogger(__name__)
 
-
 console = Console(width=400, color_system="standard")
 
 option_github_token = click.option(
@@ -53,7 +52,6 @@ option_github_token = click.option(
 class PrStat:
     PROVIDER_SCORE = 0.8
     REGULAR_SCORE = 1.0
-
     REVIEW_INTERACTION_VALUE = 2.0
     COMMENT_INTERACTION_VALUE = 1.0
     REACTION_INTERACTION_VALUE = 0.5
@@ -61,12 +59,19 @@ class PrStat:
     def __init__(self, g, pull_request: PullRequest):
         self.g = g
         self.pull_request = pull_request
+        self.title = pull_request.title
         self._users: set[str] = set()
+        self.len_comments: int = 0
+        self.comment_reactions: int = 0
         self.issue_nums: list[int] = []
-        self.len_issue_comments = 0
-        self.num_issue_comments = 0
-        self.num_issue_reactions = 0
-        self.protm_score = 0
+        self.len_issue_comments: int = 0
+        self.num_issue_comments: int = 0
+        self.num_issue_reactions: int = 0
+        self.num_comments: int = 0
+        self.num_conv_comments: int = 0
+        self.num_protm: int = 0
+        self.conv_comment_reactions: int = 0
+        self.interaction_score = 1.0
 
     @property
     def label_score(self) -> float:
@@ -77,53 +82,33 @@ class PrStat:
                 return PrStat.PROVIDER_SCORE
         return PrStat.REGULAR_SCORE
 
-    @cached_property
-    def num_comments(self):
-        """counts reviewer comments & checks for #protm tag"""
-        num_comments = 0
-        num_protm = 0
+    def calc_comments(self):
+        """counts reviewer comments, checks for #protm tag, counts rxns"""
         for comment in self.pull_request.get_comments():
             self._users.add(comment.user.login)
             lowercase_body = comment.body.lower()
             if "protm" in lowercase_body:
-                num_protm += 1
-            num_comments += 1
-        self.protm_score = num_protm
-        return num_comments
+                self.num_protm += 1
+            self.num_comments += 1
+            if comment.body is not None:
+                self.len_comments += len(comment.body)
+            for reaction in comment.get_reactions():
+                self._users.add(reaction.user.login)
+                self.comment_reactions += 1
 
-    @cached_property
-    def num_conv_comments(self) -> int:
-        """counts conversational comments & checks for #protm tag"""
-        num_conv_comments = 0
-        num_protm = 0
+    def calc_conv_comments(self):
+        """counts conversational comments, checks for #protm tag, counts rxns"""
         for conv_comment in self.pull_request.get_issue_comments():
             self._users.add(conv_comment.user.login)
             lowercase_body = conv_comment.body.lower()
             if "protm" in lowercase_body:
-                num_protm += 1
-            num_conv_comments += 1
-        self.protm_score = num_protm
-        return num_conv_comments
-
-    @cached_property
-    def num_reactions(self) -> int:
-        """counts reactions to reviewer comments"""
-        reactions = 0
-        for comment in self.pull_request.get_comments():
-            for reaction in comment.get_reactions():
-                self._users.add(reaction.user.login)
-                reactions += 1
-        return reactions
-
-    @cached_property
-    def num_conv_reactions(self) -> int:
-        """counts reactions to conversational comments"""
-        reactions = 0
-        for conv_comment in self.pull_request.get_issue_comments():
+                self.num_protm += 1
+            self.num_conv_comments += 1
             for reaction in conv_comment.get_reactions():
                 self._users.add(reaction.user.login)
-                reactions += 1
-        return reactions
+                self.conv_comment_reactions += 1
+            if conv_comment.body is not None:
+                self.len_issue_comments += len(conv_comment.body)
 
     @cached_property
     def num_reviews(self) -> int:
@@ -134,22 +119,17 @@ class PrStat:
             num_reviews += 1
         return num_reviews
 
-    @cached_property
     def issues(self):
         """finds issues in PR"""
         if self.pull_request.body is not None:
             regex = r"(?<=closes: #|elated: #)\d{5}"
             issue_strs = re.findall(regex, self.pull_request.body)
-            issue_ints = [eval(s) for s in issue_strs]
-            self.issue_nums = issue_ints
-            return issue_ints
+            self.issue_nums = [eval(s) for s in issue_strs]
 
-    @cached_property
-    def issue_reactions(self) -> int:
+    def issue_reactions(self):
         """counts reactions to issue comments"""
         if self.issue_nums:
             repo = self.g.get_repo("apache/airflow")
-            issue_reactions = 0
             for num in self.issue_nums:
                 try:
                     issue = repo.get_issue(num)
@@ -157,43 +137,22 @@ class PrStat:
                     continue
                 for reaction in issue.get_reactions():
                     self._users.add(reaction.user.login)
-                    issue_reactions += 1
-            self.num_issue_reactions = issue_reactions
-            return issue_reactions
-        return 0
-
-    @cached_property
-    def issue_comments(self) -> int:
-        """counts issue comments and calculates comment length"""
-        if self.issue_nums:
-            repo = self.g.get_repo("apache/airflow")
-            issue_comments = 0
-            len_issue_comments = 0
-            for num in self.issue_nums:
-                try:
-                    issue = repo.get_issue(num)
-                except UnknownObjectException:
-                    continue
+                    self.num_issue_reactions += 1
                 for issue_comment in issue.get_comments():
-                    issue_comments += 1
+                    self.num_issue_comments += 1
                     self._users.add(issue_comment.user.login)
                     if issue_comment.body is not None:
-                        len_issue_comments += len(issue_comment.body)
-            self.len_issue_comments = len_issue_comments
-            self.num_issue_comments = issue_comments
-            return issue_comments
-        return 0
+                        self.len_issue_comments += len(issue_comment.body)
 
-    @property
-    def interaction_score(self) -> float:
+    def calc_interaction_score(self):
+        """calculates interaction score"""
         interactions = (
-            self.num_comments + self.num_conv_comments + self.issue_comments
+            self.num_comments + self.num_conv_comments + self.num_issue_comments
         ) * PrStat.COMMENT_INTERACTION_VALUE
         interactions += (
-            self.num_reactions + self.num_conv_reactions + self.issue_reactions
+            self.comment_reactions + self.conv_comment_reactions + self.num_issue_reactions
         ) * PrStat.REACTION_INTERACTION_VALUE
-        interactions += self.num_reviews * PrStat.REVIEW_INTERACTION_VALUE
-        return interactions
+        self.interaction_score += interactions + self.num_reviews * PrStat.REVIEW_INTERACTION_VALUE
 
     @cached_property
     def num_interacting_users(self) -> int:
@@ -232,25 +191,18 @@ class PrStat:
 
     @cached_property
     def comment_length(self) -> int:
-        length = 0
-        for comment in self.pull_request.get_comments():
-            if comment.body is not None:
-                length += len(comment.body)
+        rev_length = 0
         for comment in self.pull_request.get_review_comments():
             if comment.body is not None:
-                length += len(comment.body)
-        for conv_comment in self.pull_request.get_issue_comments():
-            if conv_comment.body is not None:
-                length += len(conv_comment.body)
-        length += self.len_issue_comments
-        return length
+                rev_length += len(comment.body)
+        return self.len_comments + self.len_issue_comments + rev_length
 
     @property
     def length_score(self) -> float:
         score = 1.0
-        if self.comment_length > 3000:
+        if self.len_comments > 3000:
             score *= 1.3
-        if self.comment_length < 200:
+        if self.len_comments < 200:
             score *= 0.8
         if self.body_length > 2000:
             score *= 1.4
@@ -260,6 +212,9 @@ class PrStat:
             score *= 0.4
         return round(score, 3)
 
+    def adjust_interaction_score(self):
+        self.interaction_score *= min(self.num_protm + 1, 3)
+
     @property
     def score(self):
         #
@@ -281,12 +236,15 @@ class PrStat:
         #
         # Weight PRs with protm tags more heavily:
         # If there is at least one protm tag, multiply the interaction score by the number of tags, up to 3.
-        interaction_score = self.interaction_score
-        interaction_score *= min(self.protm_score + 1, 3)
+        #
+        self.calc_comments()
+        self.calc_conv_comments()
+        self.calc_interaction_score()
+        self.adjust_interaction_score()
 
         return round(
             1.0
-            * interaction_score
+            * self.interaction_score
             * self.label_score
             * self.length_score
             * self.change_score
@@ -295,7 +253,7 @@ class PrStat:
         )
 
     def __str__(self) -> str:
-        if self.protm_score > 0:
+        if self.num_protm > 0:
             return (
                 "[magenta]##Tagged PR## [/]"
                 f"Score: {self.score:.2f}: PR{self.pull_request.number}"
@@ -312,7 +270,7 @@ class PrStat:
             )
 
     def verboseStr(self) -> str:
-        if self.protm_score > 0:
+        if self.num_protm > 0:
             console.print("********************* Tagged with '#protm' *********************", style="magenta")
         return (
             f"-- Created at [bright_blue]{self.pull_request.created_at}[/], "
@@ -320,14 +278,14 @@ class PrStat:
             f"-- Label score: [green]{self.label_score}[/]\n"
             f"-- Length score: [green]{self.length_score}[/] "
             f"(body length: {self.body_length}, "
-            f"comment length: {self.comment_length})\n"
+            f"comment length: {self.len_comments})\n"
             f"-- Interaction score: [green]{self.interaction_score}[/] "
             f"(users interacting: {self.num_interacting_users}, "
             f"reviews: {self.num_reviews}, "
             f"review comments: {self.num_comments}, "
-            f"review reactions: {self.num_reactions}, "
+            f"review reactions: {self.comment_reactions}, "
             f"non-review comments: {self.num_conv_comments}, "
-            f"non-review reactions: {self.num_conv_reactions}, "
+            f"non-review reactions: {self.conv_comment_reactions}, "
             f"issue comments: {self.num_issue_comments}, "
             f"issue reactions: {self.num_issue_reactions})\n"
             f"-- Change score: [green]{self.change_score}[/] "
@@ -359,6 +317,11 @@ DEFAULT_TOP_PRS = 10
 @click.option("--save", type=click.File("wb"), help="Save PR data to a pickle file")
 @click.option("--load", type=click.File("rb"), help="Load PR data from a file and recalculate scores")
 @click.option("--verbose", is_flag="True", help="Print scoring details")
+@click.option(
+    "--rate-limit",
+    is_flag="True",
+    help="Print API rate limit reset time using system time, and requests remaining",
+)
 def main(
     github_token: str,
     date_start: datetime,
@@ -367,67 +330,76 @@ def main(
     date_end: datetime,
     top_number: int,
     verbose: bool,
+    rate_limit: bool,
 ):
+    g = Github(github_token)
+
+    if rate_limit:
+        r = g.get_rate_limit()
+        requests_remaining: int = r.core.remaining
+        console.print(
+            f"[blue]GitHub API Rate Limit Info\n"
+            f"[green]Requests remaining: [red]{requests_remaining}\n"
+            f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
+        )
+
     selected_prs: list[PrStat] = []
     if load:
         console.print("Loading PRs from cache and recalculating scores.")
         selected_prs = pickle.load(load, encoding="bytes")
         issue_num = 0
-        for pr_stat in selected_prs:
+        for pr in selected_prs:
             issue_num += 1
             console.print(
-                f"[green]Loading PR: #{pr_stat.pull_request.number} `{pr_stat.pull_request.title}`.[/]"
-                f" Score: {pr_stat.score}."
-                f" Url: {pr_stat.pull_request.html_url}"
+                f"[green]Loading PR: #{pr.pull_request.number} `{pr.pull_request.title}`.[/]"
+                f" Score: {pr.score}."
+                f" Url: {pr.pull_request.html_url}"
             )
 
             if verbose:
-                console.print(pr_stat.verboseStr())
+                console.print(pr.verboseStr())
 
     else:
         console.print(f"Finding best candidate PRs between {date_start} and {date_end}.")
-        g = Github(github_token)
         repo = g.get_repo("apache/airflow")
-        pulls = repo.get_pulls(state="closed", sort="created", direction="desc")
+        commits = repo.get_commits(since=date_start, until=date_end)
+        pulls: list[PullRequest] = [pull for commit in commits for pull in commit.get_pulls()]
         issue_num = 0
-        for pr in pulls:
-            if not pr.merged:
-                continue
-
-            if not (date_start < pr.merged_at < date_end):
-                console.print(
-                    f"[bright_blue]Skipping {pr.number} {pr.title} as it was not "
-                    f"merged between {date_start} and {date_end}]"
-                )
-                continue
-
-            if pr.merged_at < date_start:
-                console.print("[bright_blue]Completed selecting candidates")
-                break
-
+        scores: dict = {}
+        for pull in pulls:
+            p = PrStat(g=g, pull_request=pull)  # type: ignore
+            scores.update({pull.number: [p.score, pull.title]})
             issue_num += 1
-            pr_stat = PrStat(pull_request=pr, g=g)  # type: ignore
             console.print(
-                f"[green]Selecting PR: #{pr.number} `{pr.title}` as candidate.[/]"
-                f" Score: {pr_stat.score}."
-                f" Url: {pr.html_url}"
+                f"[green]Selecting PR: #{pull.number} `{pull.title}` as candidate.[/]"
+                f" Score: {scores[pull.number][0]}."
+                f" Url: {pull.html_url}"
             )
 
             if verbose:
-                console.print(pr_stat.verboseStr())
+                console.print(p.verboseStr())
 
-            selected_prs.append(pr_stat)
+            selected_prs.append(p)
             if issue_num == MAX_PR_CANDIDATES:
                 console.print(f"[red]Reached {MAX_PR_CANDIDATES}. Stopping")
                 break
 
     console.print(f"Top {top_number} out of {issue_num} PRs:")
-    for pr_stat in sorted(selected_prs, key=lambda s: -s.score)[:top_number]:
-        console.print(f" * {pr_stat}")
+    for pr_scored in sorted(scores.items(), key=lambda s: s[1], reverse=True)[:top_number]:
+        console.print(f"[green] * PR #{pr_scored[0]}: {pr_scored[1][1]}. Score: [magenta]{pr_scored[1][0]}")
 
     if save:
         pickle.dump(selected_prs, save)
 
+    if rate_limit:
+        r = g.get_rate_limit()
+        console.print(
+            f"[blue]GitHub API Rate Limit Info\n"
+            f"[green]Requests remaining: [red]{r.core.remaining}\n"
+            f"[green]Requests made: [red]{requests_remaining - r.core.remaining}\n"
+            f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
+        )
+
 
 if __name__ == "__main__":
     main()

[airflow] branch main updated: improve performance of pr script in dev/stats (#32735)

Reply via email to