uros-b commented on code in PR #56752:
URL: https://github.com/apache/spark/pull/56752#discussion_r3470508045


##########
dev/pr_merge_status.py:
##########
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Reports which apache/spark branches a pull request was merged to, so the caller does
+not have to reason about it by hand (see the "Checking PR Merge Status" section of
+AGENTS.md for why this is non-obvious):
+
+    $ dev/pr_merge_status.py 56356
+    PR #56356: [SPARK-57295][SQL] Make database location validation consistent ...
+    merged: yes
+      master       9357bc9ae05
+      branch-4.x   72edddb358e
+
+Spark merges with dev/merge_spark_pr.py rather than the GitHub button, so a merged PR
+shows up as "Closed" (not "Merged") with empty merge metadata, and backports are plain
+cherry-pick pushes. The merge script writes a closing trailer "Closes #<pr> from
+<author>/<branch>" into the master commit, and every cherry-pick of it to a maintenance
+branch retains that trailer. This script keys on that trailer: a branch counts only
+when a commit carrying it is reachable from the branch AND has not been reverted there.
+It reports the mechanical facts only -- whether a change *should* have been backported
+to some branch is a judgement call and is NOT made here.
+
+Reads from a local git remote pointing at apache/spark (the `upstream` the AGENTS.md
+pre-flight has you configure). Run `git fetch <upstream>` first so the branch refs are
+current, otherwise a recently merged PR may look unmerged.
+
+Usage: dev/pr_merge_status.py <pr-number>
+"""
+
+import re
+import subprocess
+import sys
+
+
+def git(*args):
+    """Runs a git command, exiting with its stderr on failure."""
+    result = subprocess.run(["git", *args], capture_output=True, text=True, encoding="utf-8")
+    if result.returncode != 0:
+        sys.stderr.write(result.stderr)
+        sys.exit("error: command failed: git " + " ".join(args))
+    return result.stdout
+
+
+def detect_remote():
+    """Returns the name of the git remote pointing to apache/spark, or None."""
+    for line in git("remote", "-v").splitlines():
+        parts = line.split()
+        if (
+            len(parts) >= 3
+            and parts[2] == "(fetch)"
+            and re.search(r"[:/]apache/spark(\.git)?$", parts[1])
+        ):
+            return parts[0]
+    return None
+
+
+def commits_with_trailer(trailer):
+    """Returns the full SHAs of all reachable commits whose message contains `trailer`."""
+    out = git("log", "--all", "--fixed-strings", "--grep", trailer, "--format=%H")
+    return out.split()
+
+
+def official_branches_containing(commit, remote):
+    """Returns the apache/spark branch names (e.g. 'master', 'branch-4.x') whose tip is a
+    descendant of `commit`, i.e. that contain it -- ignoring the remote's HEAD alias and
+    any non-branch refs."""
+    out = git(
+        "for-each-ref",
+        "--contains",
+        commit,
+        "--format=%(refname:short)",
+        "refs/remotes/%s" % remote,
+    )
+    prefix = remote + "/"
+    branches = []
+    for ref in out.splitlines():
+        if not ref.startswith(prefix):
+            continue
+        name = ref[len(prefix) :]
+        if name == "HEAD":
+            continue
+        branches.append(name)
+    return branches
+
+
+def revert_commits(commit):
+    """Returns the SHAs of commits that revert `commit` (git's `git revert` writes a
+    'This reverts commit <full-sha>' line into the revert commit's message)."""
+    out = git(
+        "log",
+        "--all",
+        "--fixed-strings",
+        "--grep",
+        "This reverts commit %s" % commit,
+        "--format=%H",
+    )
+    return out.split()
+
+
+def branch_sort_key(name):
+    """Sorts `master` first, then branch-<major>.<minor> ascending, with branch-<N>.x
+    (the active dev line for the next feature release) after its numeric siblings."""
+    if name == "master":
+        return (0, 0, 0)
+    match = re.fullmatch(r"branch-(\d+)\.(x|\d+)", name)
+    if match is None:
+        return (2, 0, name)
+    major = int(match.group(1))
+    minor = float("inf") if match.group(2) == "x" else int(match.group(2))
+    return (1, major, minor)
+
+
+def main():
+    if len(sys.argv) != 2:
+        sys.exit("Usage: dev/pr_merge_status.py <pr-number>")
+    pr = sys.argv[1].lstrip("#")
+    if not pr.isdigit():
+        sys.exit("error: <pr-number> must be a number, got %r" % sys.argv[1])
+
+    remote = detect_remote()
+    if remote is None:
+        sys.exit(
+            "error: no git remote points to apache/spark. Add one and retry:\n"
+            "    git remote add upstream https://github.com/apache/spark.git"
+        )
+
+    trailer = "Closes #%s from " % pr
+    # Map each official branch to the trailer-carrying commit that landed the PR there,
+    # plus whether that commit was later reverted on the branch. A PR on both master and
+    # a maintenance branch has a distinct commit per branch (the merge and its
+    # cherry-pick), each carrying the same trailer.
+    landed = {}
+    for commit in commits_with_trailer(trailer):
+        branches = official_branches_containing(commit, remote)
+        if not branches:
+            continue  # trailer commit only on a fork ref, not an apache/spark branch
+        reverts = revert_commits(commit)
+        for branch in branches:
+            reverted_by = next(
+                (r for r in reverts if branch in official_branches_containing(r, remote)),
+                None,
+            )
+            landed[branch] = (commit[:11], reverted_by[:11] if reverted_by else None)
+
+    if not landed:
+        print("PR #%s: not merged to any apache/spark branch" % pr)
+        print(
+            'No commit carrying the "%s" trailer is reachable from a branch on '
+            "remote '%s'.\nThe PR may be open/unmerged, or the branches may be stale -- "
+            "run `git fetch %s` and retry." % (trailer, remote, remote)
+        )
+        return
+
+    any_commit = next(iter(landed.values()))[0]
+    subject = git("log", "-1", "--format=%s", any_commit).strip()

Review Comment:
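   Nit: `any_commit` is an 11-character abbreviated SHA (it comes from
   `commit[:11]` when `landed` is filled in), and it is passed to
   `git log -1 --format=%s`. That resolves fine at Spark's current repository
   size, but it relies on the abbreviation staying unambiguous, which git does
   not guarantee. Keeping the full SHA for this lookup, and abbreviating only
   when printing, would be safer.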



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to