This is an automated email from the ASF dual-hosted git repository.
mobuchowski pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/airflow.git
The following commit(s) were added to refs/heads/main by this push:
new 6f75a8a917 improve performance of pr script in dev/stats (#32735)
6f75a8a917 is described below
commit 6f75a8a9172725f63b14b83b4b57c4a10a4286c8
Author: Michael Robinson <[email protected]>
AuthorDate: Mon Jul 24 09:00:57 2023 -0400
improve performance of pr script in dev/stats (#32735)
* add error handling in case of 404 errors to protm script
Signed-off-by: Michael Robinson <[email protected]>
* add error handling in case of missing score when using pickle file
Signed-off-by: Michael Robinson <[email protected]>
* reformat
Signed-off-by: Michael Robinson <[email protected]>
* add UknownObjectException class to error handling
Signed-off-by: Michael Robinson <[email protected]>
* fix github imports
Signed-off-by: Michael Robinson <[email protected]>
* fix interaction_score
Signed-off-by: Michael Robinson <[email protected]>
* remove attribute error handling
Signed-off-by: Michael Robinson <[email protected]>
* remove second attribute error handling block
Signed-off-by: Michael Robinson <[email protected]>
* change api endpoint for pull requests in pr script in stats
Signed-off-by: Michael Robinson <[email protected]>
* reformat
Signed-off-by: Michael Robinson <[email protected]>
* Update dev/stats/get_important_pr_candidates.py
Co-authored-by: Wei Lee <[email protected]>
* reformat nested loop and add new line
Signed-off-by: Michael Robinson <[email protected]>
* consolidate funs to decrease number of api calls
Signed-off-by: Michael Robinson <[email protected]>
* add rate limit parameter and limit info to output
Signed-off-by: Michael Robinson <[email protected]>
* refactor after linting
Signed-off-by: Michael Robinson <[email protected]>
* refactor after linting contd
Signed-off-by: Michael Robinson <[email protected]>
---------
Signed-off-by: Michael Robinson <[email protected]>
Co-authored-by: Wei Lee <[email protected]>
---
dev/stats/get_important_pr_candidates.py | 228 ++++++++++++++-----------------
1 file changed, 100 insertions(+), 128 deletions(-)
diff --git a/dev/stats/get_important_pr_candidates.py
b/dev/stats/get_important_pr_candidates.py
index 3aed16f13f..738eed53c2 100755
--- a/dev/stats/get_important_pr_candidates.py
+++ b/dev/stats/get_important_pr_candidates.py
@@ -33,7 +33,6 @@ from rich.console import Console
logger = logging.getLogger(__name__)
-
console = Console(width=400, color_system="standard")
option_github_token = click.option(
@@ -53,7 +52,6 @@ option_github_token = click.option(
class PrStat:
PROVIDER_SCORE = 0.8
REGULAR_SCORE = 1.0
-
REVIEW_INTERACTION_VALUE = 2.0
COMMENT_INTERACTION_VALUE = 1.0
REACTION_INTERACTION_VALUE = 0.5
@@ -61,12 +59,19 @@ class PrStat:
def __init__(self, g, pull_request: PullRequest):
self.g = g
self.pull_request = pull_request
+ self.title = pull_request.title
self._users: set[str] = set()
+ self.len_comments: int = 0
+ self.comment_reactions: int = 0
self.issue_nums: list[int] = []
- self.len_issue_comments = 0
- self.num_issue_comments = 0
- self.num_issue_reactions = 0
- self.protm_score = 0
+ self.len_issue_comments: int = 0
+ self.num_issue_comments: int = 0
+ self.num_issue_reactions: int = 0
+ self.num_comments: int = 0
+ self.num_conv_comments: int = 0
+ self.num_protm: int = 0
+ self.conv_comment_reactions: int = 0
+ self.interaction_score = 1.0
@property
def label_score(self) -> float:
@@ -77,53 +82,33 @@ class PrStat:
return PrStat.PROVIDER_SCORE
return PrStat.REGULAR_SCORE
- @cached_property
- def num_comments(self):
- """counts reviewer comments & checks for #protm tag"""
- num_comments = 0
- num_protm = 0
+ def calc_comments(self):
+ """counts reviewer comments, checks for #protm tag, counts rxns"""
for comment in self.pull_request.get_comments():
self._users.add(comment.user.login)
lowercase_body = comment.body.lower()
if "protm" in lowercase_body:
- num_protm += 1
- num_comments += 1
- self.protm_score = num_protm
- return num_comments
+ self.num_protm += 1
+ self.num_comments += 1
+ if comment.body is not None:
+ self.len_comments += len(comment.body)
+ for reaction in comment.get_reactions():
+ self._users.add(reaction.user.login)
+ self.comment_reactions += 1
- @cached_property
- def num_conv_comments(self) -> int:
- """counts conversational comments & checks for #protm tag"""
- num_conv_comments = 0
- num_protm = 0
+ def calc_conv_comments(self):
+ """counts conversational comments, checks for #protm tag, counts
rxns"""
for conv_comment in self.pull_request.get_issue_comments():
self._users.add(conv_comment.user.login)
lowercase_body = conv_comment.body.lower()
if "protm" in lowercase_body:
- num_protm += 1
- num_conv_comments += 1
- self.protm_score = num_protm
- return num_conv_comments
-
- @cached_property
- def num_reactions(self) -> int:
- """counts reactions to reviewer comments"""
- reactions = 0
- for comment in self.pull_request.get_comments():
- for reaction in comment.get_reactions():
- self._users.add(reaction.user.login)
- reactions += 1
- return reactions
-
- @cached_property
- def num_conv_reactions(self) -> int:
- """counts reactions to conversational comments"""
- reactions = 0
- for conv_comment in self.pull_request.get_issue_comments():
+ self.num_protm += 1
+ self.num_conv_comments += 1
for reaction in conv_comment.get_reactions():
self._users.add(reaction.user.login)
- reactions += 1
- return reactions
+ self.conv_comment_reactions += 1
+ if conv_comment.body is not None:
+ self.len_issue_comments += len(conv_comment.body)
@cached_property
def num_reviews(self) -> int:
@@ -134,22 +119,17 @@ class PrStat:
num_reviews += 1
return num_reviews
- @cached_property
def issues(self):
"""finds issues in PR"""
if self.pull_request.body is not None:
regex = r"(?<=closes: #|elated: #)\d{5}"
issue_strs = re.findall(regex, self.pull_request.body)
- issue_ints = [eval(s) for s in issue_strs]
- self.issue_nums = issue_ints
- return issue_ints
+ self.issue_nums = [eval(s) for s in issue_strs]
- @cached_property
- def issue_reactions(self) -> int:
+ def issue_reactions(self):
"""counts reactions to issue comments"""
if self.issue_nums:
repo = self.g.get_repo("apache/airflow")
- issue_reactions = 0
for num in self.issue_nums:
try:
issue = repo.get_issue(num)
@@ -157,43 +137,22 @@ class PrStat:
continue
for reaction in issue.get_reactions():
self._users.add(reaction.user.login)
- issue_reactions += 1
- self.num_issue_reactions = issue_reactions
- return issue_reactions
- return 0
-
- @cached_property
- def issue_comments(self) -> int:
- """counts issue comments and calculates comment length"""
- if self.issue_nums:
- repo = self.g.get_repo("apache/airflow")
- issue_comments = 0
- len_issue_comments = 0
- for num in self.issue_nums:
- try:
- issue = repo.get_issue(num)
- except UnknownObjectException:
- continue
+ self.num_issue_reactions += 1
for issue_comment in issue.get_comments():
- issue_comments += 1
+ self.num_issue_comments += 1
self._users.add(issue_comment.user.login)
if issue_comment.body is not None:
- len_issue_comments += len(issue_comment.body)
- self.len_issue_comments = len_issue_comments
- self.num_issue_comments = issue_comments
- return issue_comments
- return 0
+ self.len_issue_comments += len(issue_comment.body)
- @property
- def interaction_score(self) -> float:
+ def calc_interaction_score(self):
+ """calculates interaction score"""
interactions = (
- self.num_comments + self.num_conv_comments + self.issue_comments
+ self.num_comments + self.num_conv_comments +
self.num_issue_comments
) * PrStat.COMMENT_INTERACTION_VALUE
interactions += (
- self.num_reactions + self.num_conv_reactions + self.issue_reactions
+ self.comment_reactions + self.conv_comment_reactions +
self.num_issue_reactions
) * PrStat.REACTION_INTERACTION_VALUE
- interactions += self.num_reviews * PrStat.REVIEW_INTERACTION_VALUE
- return interactions
+ self.interaction_score += interactions + self.num_reviews *
PrStat.REVIEW_INTERACTION_VALUE
@cached_property
def num_interacting_users(self) -> int:
@@ -232,25 +191,18 @@ class PrStat:
@cached_property
def comment_length(self) -> int:
- length = 0
- for comment in self.pull_request.get_comments():
- if comment.body is not None:
- length += len(comment.body)
+ rev_length = 0
for comment in self.pull_request.get_review_comments():
if comment.body is not None:
- length += len(comment.body)
- for conv_comment in self.pull_request.get_issue_comments():
- if conv_comment.body is not None:
- length += len(conv_comment.body)
- length += self.len_issue_comments
- return length
+ rev_length += len(comment.body)
+ return self.len_comments + self.len_issue_comments + rev_length
@property
def length_score(self) -> float:
score = 1.0
- if self.comment_length > 3000:
+ if self.len_comments > 3000:
score *= 1.3
- if self.comment_length < 200:
+ if self.len_comments < 200:
score *= 0.8
if self.body_length > 2000:
score *= 1.4
@@ -260,6 +212,9 @@ class PrStat:
score *= 0.4
return round(score, 3)
+ def adjust_interaction_score(self):
+ self.interaction_score *= min(self.num_protm + 1, 3)
+
@property
def score(self):
#
@@ -281,12 +236,15 @@ class PrStat:
#
# Weight PRs with protm tags more heavily:
# If there is at least one protm tag, multiply the interaction score
by the number of tags, up to 3.
- interaction_score = self.interaction_score
- interaction_score *= min(self.protm_score + 1, 3)
+ #
+ self.calc_comments()
+ self.calc_conv_comments()
+ self.calc_interaction_score()
+ self.adjust_interaction_score()
return round(
1.0
- * interaction_score
+ * self.interaction_score
* self.label_score
* self.length_score
* self.change_score
@@ -295,7 +253,7 @@ class PrStat:
)
def __str__(self) -> str:
- if self.protm_score > 0:
+ if self.num_protm > 0:
return (
"[magenta]##Tagged PR## [/]"
f"Score: {self.score:.2f}: PR{self.pull_request.number}"
@@ -312,7 +270,7 @@ class PrStat:
)
def verboseStr(self) -> str:
- if self.protm_score > 0:
+ if self.num_protm > 0:
console.print("********************* Tagged with '#protm'
*********************", style="magenta")
return (
f"-- Created at [bright_blue]{self.pull_request.created_at}[/], "
@@ -320,14 +278,14 @@ class PrStat:
f"-- Label score: [green]{self.label_score}[/]\n"
f"-- Length score: [green]{self.length_score}[/] "
f"(body length: {self.body_length}, "
- f"comment length: {self.comment_length})\n"
+ f"comment length: {self.len_comments})\n"
f"-- Interaction score: [green]{self.interaction_score}[/] "
f"(users interacting: {self.num_interacting_users}, "
f"reviews: {self.num_reviews}, "
f"review comments: {self.num_comments}, "
- f"review reactions: {self.num_reactions}, "
+ f"review reactions: {self.comment_reactions}, "
f"non-review comments: {self.num_conv_comments}, "
- f"non-review reactions: {self.num_conv_reactions}, "
+ f"non-review reactions: {self.conv_comment_reactions}, "
f"issue comments: {self.num_issue_comments}, "
f"issue reactions: {self.num_issue_reactions})\n"
f"-- Change score: [green]{self.change_score}[/] "
@@ -359,6 +317,11 @@ DEFAULT_TOP_PRS = 10
@click.option("--save", type=click.File("wb"), help="Save PR data to a pickle
file")
@click.option("--load", type=click.File("rb"), help="Load PR data from a file
and recalculate scores")
@click.option("--verbose", is_flag="True", help="Print scoring details")
[email protected](
+ "--rate-limit",
+ is_flag="True",
+ help="Print API rate limit reset time using system time, and requests
remaining",
+)
def main(
github_token: str,
date_start: datetime,
@@ -367,67 +330,76 @@ def main(
date_end: datetime,
top_number: int,
verbose: bool,
+ rate_limit: bool,
):
+ g = Github(github_token)
+
+ if rate_limit:
+ r = g.get_rate_limit()
+ requests_remaining: int = r.core.remaining
+ console.print(
+ f"[blue]GitHub API Rate Limit Info\n"
+ f"[green]Requests remaining: [red]{requests_remaining}\n"
+ f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
+ )
+
selected_prs: list[PrStat] = []
if load:
console.print("Loading PRs from cache and recalculating scores.")
selected_prs = pickle.load(load, encoding="bytes")
issue_num = 0
- for pr_stat in selected_prs:
+ for pr in selected_prs:
issue_num += 1
console.print(
- f"[green]Loading PR: #{pr_stat.pull_request.number}
`{pr_stat.pull_request.title}`.[/]"
- f" Score: {pr_stat.score}."
- f" Url: {pr_stat.pull_request.html_url}"
+ f"[green]Loading PR: #{pr.pull_request.number}
`{pr.pull_request.title}`.[/]"
+ f" Score: {pr.score}."
+ f" Url: {pr.pull_request.html_url}"
)
if verbose:
- console.print(pr_stat.verboseStr())
+ console.print(pr.verboseStr())
else:
console.print(f"Finding best candidate PRs between {date_start} and
{date_end}.")
- g = Github(github_token)
repo = g.get_repo("apache/airflow")
- pulls = repo.get_pulls(state="closed", sort="created",
direction="desc")
+ commits = repo.get_commits(since=date_start, until=date_end)
+ pulls: list[PullRequest] = [pull for commit in commits for pull in
commit.get_pulls()]
issue_num = 0
- for pr in pulls:
- if not pr.merged:
- continue
-
- if not (date_start < pr.merged_at < date_end):
- console.print(
- f"[bright_blue]Skipping {pr.number} {pr.title} as it was
not "
- f"merged between {date_start} and {date_end}]"
- )
- continue
-
- if pr.merged_at < date_start:
- console.print("[bright_blue]Completed selecting candidates")
- break
-
+ scores: dict = {}
+ for pull in pulls:
+ p = PrStat(g=g, pull_request=pull) # type: ignore
+ scores.update({pull.number: [p.score, pull.title]})
issue_num += 1
- pr_stat = PrStat(pull_request=pr, g=g) # type: ignore
console.print(
- f"[green]Selecting PR: #{pr.number} `{pr.title}` as
candidate.[/]"
- f" Score: {pr_stat.score}."
- f" Url: {pr.html_url}"
+ f"[green]Selecting PR: #{pull.number} `{pull.title}` as
candidate.[/]"
+ f" Score: {scores[pull.number][0]}."
+ f" Url: {pull.html_url}"
)
if verbose:
- console.print(pr_stat.verboseStr())
+ console.print(p.verboseStr())
- selected_prs.append(pr_stat)
+ selected_prs.append(p)
if issue_num == MAX_PR_CANDIDATES:
console.print(f"[red]Reached {MAX_PR_CANDIDATES}. Stopping")
break
console.print(f"Top {top_number} out of {issue_num} PRs:")
- for pr_stat in sorted(selected_prs, key=lambda s: -s.score)[:top_number]:
- console.print(f" * {pr_stat}")
+ for pr_scored in sorted(scores.items(), key=lambda s: s[1],
reverse=True)[:top_number]:
+ console.print(f"[green] * PR #{pr_scored[0]}: {pr_scored[1][1]}.
Score: [magenta]{pr_scored[1][0]}")
if save:
pickle.dump(selected_prs, save)
+ if rate_limit:
+ r = g.get_rate_limit()
+ console.print(
+ f"[blue]GitHub API Rate Limit Info\n"
+ f"[green]Requests remaining: [red]{r.core.remaining}\n"
+ f"[green]Requests made: [red]{requests_remaining -
r.core.remaining}\n"
+ f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
+ )
+
if __name__ == "__main__":
main()