LuciferYang commented on code in PR #56026:
URL: https://github.com/apache/spark/pull/56026#discussion_r3282505190
##########
dev/merge_spark_pr.py:
##########
@@ -773,6 +773,169 @@ def standardize_jira_ref(text):
return clean_text
+# Recognized Spark component tags. A PR title must contain at least one of
+# these (within square brackets) to be merged. The list shown to the committer
+# in prompt_for_components is a subset of this set selected for prominence.
+# Status markers like [MINOR] or [FOLLOWUP] are intentionally absent and do
+# not satisfy the component requirement. [WIP] is also absent: a WIP PR
+# should be aborted at the earlier WIP warning, not merged.
+COMPONENT_MARKERS = frozenset(
+ {
+ "AVRO",
+ "BUILD",
+ "CONNECT",
+ "CORE",
+ "DOC",
+ "DOCS",
+ "GRAPHX",
+ "INFRA",
+ "K8S",
+ "ML",
+ "MLLIB",
+ "PROTOBUF",
+ "PS",
+ "PYTHON",
+ "R",
+ "REPL",
+ "SECURITY",
+ "SQL",
+ "SS",
+ "STREAMING",
+ "TESTS",
+ "UI",
+ "YARN",
+ }
+)
+
+
+def title_missing_component(title):
+ """
+ Return True when the PR title has no bracket whose content is in
+ COMPONENT_MARKERS.
+
+ A compliant Spark PR title looks like:
[SPARK-XXXXX][COMPONENT1][COMPONENT2] Rest,
+ where at least one [COMPONENT] is in COMPONENT_MARKERS. Status markers like
+ [MINOR] or [FOLLOWUP] are not in COMPONENT_MARKERS and don't satisfy the
+ requirement. Unknown tags like [FOO] also don't satisfy it. Revert PRs,
+ which reuse the original commit's title verbatim, are not flagged.
+
+ >>> title_missing_component("[SPARK-56853] Improve PATH Tests")
+ True
+ >>> title_missing_component("[SPARK-56853][SQL] Improve PATH Tests")
+ False
+ >>> title_missing_component("[SPARK-56853][SQL][TESTS] Improve PATH Tests")
+ False
+ >>> title_missing_component("[SPARK-1234][SPARK-5678] Title")
+ True
+ >>> title_missing_component("[SPARK-1234][SPARK-5678][CORE] Title")
+ False
+ >>> title_missing_component('Revert "[SPARK-48591][PYTHON] Simplify"')
+ False
+ >>> title_missing_component("Plain title with no brackets")
+ True
+ >>> title_missing_component("[SPARK-1234][MINOR] Fix typo")
+ True
+ >>> title_missing_component("[SPARK-1234][FOLLOWUP] Follow up")
+ True
+ >>> title_missing_component("[SPARK-1234][TRIVIAL] Trivial fix")
+ True
+ >>> title_missing_component("[SPARK-1234][SQL][MINOR] SQL typo fix")
+ False
+ >>> title_missing_component("[SPARK-1234][FOO] Unknown component")
+ True
+ >>> title_missing_component("[MINOR] Fix typo")
+ True
+ """
+ if title.startswith('Revert "') and title.endswith('"'):
+ return False
+ brackets = re.findall(r"\[([^\]]+)\]", title)
+ return not any(b.strip().upper() in COMPONENT_MARKERS for b in brackets)
+
+
+def insert_components_into_title(title, components):
+ """
+ Insert [COMPONENT1][COMPONENT2]... after the leading [SPARK-XXX] ref(s) in
title.
+ If the title has no leading SPARK ref, the component tag is prepended.
When the
+ remainder of the title starts with another [bracket] tag (e.g. an existing
+ sub-tag like [SDP]), the inserted block is joined to it without a space so
all
+ leading brackets stay adjacent.
+
+ >>> insert_components_into_title("[SPARK-56853] Improve PATH Tests",
["SQL"])
+ '[SPARK-56853][SQL] Improve PATH Tests'
+ >>> insert_components_into_title(
+ ... "[SPARK-1234][SPARK-5678] Title", ["CORE", "SQL"])
+ '[SPARK-1234][SPARK-5678][CORE][SQL] Title'
+ >>> insert_components_into_title("Random title", ["CORE"])
+ '[CORE] Random title'
+ >>> insert_components_into_title("[SPARK-56853]Improve PATH Tests",
["SQL"])
+ '[SPARK-56853][SQL] Improve PATH Tests'
+ >>> insert_components_into_title("[SPARK-56953][SDP] Implement", ["SQL"])
+ '[SPARK-56953][SQL][SDP] Implement'
+ >>> insert_components_into_title(
+ ... "[SPARK-56953][SDP] Implement", ["SQL", "CORE"])
+ '[SPARK-56953][SQL][CORE][SDP] Implement'
+ >>> insert_components_into_title("[MINOR] Fix typo", ["CORE"])
+ '[CORE][MINOR] Fix typo'
+ >>> insert_components_into_title("[MINOR] Fix typo", ["CORE", "SQL"])
+ '[CORE][SQL][MINOR] Fix typo'
+ """
+ component_tag = "".join("[%s]" % c for c in components)
+ m = re.match(r"^((?:\[SPARK-[0-9]{3,6}\])+)\s*(.*)$", title)
Review Comment:
```suggestion
m = re.match(r"^((?:\[SPARK-[0-9]+\])+)\s*(.*)$", title)
```
Current Spark JIRA numbers span 3-6 digits (e.g., legacy `SPARK-979` through
current `SPARK-56979`). Once SPARK numbers reach 7 digits, the `{3,6}` upper
bound silently breaks the matcher — `insert_components_into_title` falls
through to the no-SPARK-ref prepend branch and produces
`[COMPONENT][SPARK-1000000] Title` instead of `[SPARK-1000000][COMPONENT]
Title`. JIRA numbering is monotonically increasing, so absent any future fix to
this regex, the malformation is **inevitable** once that threshold is reached
(hence MEDIUM, not LOW). Drop the upper bound; the leading `[SPARK-` already
disambiguates from non-SPARK brackets.
##########
dev/merge_spark_pr.py:
##########
@@ -773,6 +773,169 @@ def standardize_jira_ref(text):
return clean_text
+# Recognized Spark component tags. A PR title must contain at least one of
+# these (within square brackets) to be merged. The list shown to the committer
+# in prompt_for_components is a subset of this set selected for prominence.
+# Status markers like [MINOR] or [FOLLOWUP] are intentionally absent and do
+# not satisfy the component requirement. [WIP] is also absent: a WIP PR
+# should be aborted at the earlier WIP warning, not merged.
+COMPONENT_MARKERS = frozenset(
+ {
+ "AVRO",
+ "BUILD",
+ "CONNECT",
+ "CORE",
+ "DOC",
+ "DOCS",
+ "GRAPHX",
+ "INFRA",
+ "K8S",
+ "ML",
+ "MLLIB",
+ "PROTOBUF",
+ "PS",
+ "PYTHON",
+ "R",
+ "REPL",
+ "SECURITY",
+ "SQL",
+ "SS",
+ "STREAMING",
+ "TESTS",
+ "UI",
+ "YARN",
+ }
+)
+
+
+def title_missing_component(title):
+ """
+ Return True when the PR title has no bracket whose content is in
+ COMPONENT_MARKERS.
+
+ A compliant Spark PR title looks like:
[SPARK-XXXXX][COMPONENT1][COMPONENT2] Rest,
+ where at least one [COMPONENT] is in COMPONENT_MARKERS. Status markers like
+ [MINOR] or [FOLLOWUP] are not in COMPONENT_MARKERS and don't satisfy the
+ requirement. Unknown tags like [FOO] also don't satisfy it. Revert PRs,
+ which reuse the original commit's title verbatim, are not flagged.
+
+ >>> title_missing_component("[SPARK-56853] Improve PATH Tests")
+ True
+ >>> title_missing_component("[SPARK-56853][SQL] Improve PATH Tests")
+ False
+ >>> title_missing_component("[SPARK-56853][SQL][TESTS] Improve PATH Tests")
+ False
+ >>> title_missing_component("[SPARK-1234][SPARK-5678] Title")
+ True
+ >>> title_missing_component("[SPARK-1234][SPARK-5678][CORE] Title")
+ False
+ >>> title_missing_component('Revert "[SPARK-48591][PYTHON] Simplify"')
+ False
+ >>> title_missing_component("Plain title with no brackets")
+ True
+ >>> title_missing_component("[SPARK-1234][MINOR] Fix typo")
+ True
+ >>> title_missing_component("[SPARK-1234][FOLLOWUP] Follow up")
+ True
+ >>> title_missing_component("[SPARK-1234][TRIVIAL] Trivial fix")
+ True
+ >>> title_missing_component("[SPARK-1234][SQL][MINOR] SQL typo fix")
+ False
+ >>> title_missing_component("[SPARK-1234][FOO] Unknown component")
+ True
+ >>> title_missing_component("[MINOR] Fix typo")
+ True
+ """
+ if title.startswith('Revert "') and title.endswith('"'):
+ return False
+ brackets = re.findall(r"\[([^\]]+)\]", title)
+ return not any(b.strip().upper() in COMPONENT_MARKERS for b in brackets)
+
+
+def insert_components_into_title(title, components):
+ """
+ Insert [COMPONENT1][COMPONENT2]... after the leading [SPARK-XXX] ref(s) in
title.
+ If the title has no leading SPARK ref, the component tag is prepended.
When the
+ remainder of the title starts with another [bracket] tag (e.g. an existing
+ sub-tag like [SDP]), the inserted block is joined to it without a space so
all
+ leading brackets stay adjacent.
+
+ >>> insert_components_into_title("[SPARK-56853] Improve PATH Tests",
["SQL"])
+ '[SPARK-56853][SQL] Improve PATH Tests'
+ >>> insert_components_into_title(
+ ... "[SPARK-1234][SPARK-5678] Title", ["CORE", "SQL"])
+ '[SPARK-1234][SPARK-5678][CORE][SQL] Title'
+ >>> insert_components_into_title("Random title", ["CORE"])
+ '[CORE] Random title'
+ >>> insert_components_into_title("[SPARK-56853]Improve PATH Tests",
["SQL"])
+ '[SPARK-56853][SQL] Improve PATH Tests'
+ >>> insert_components_into_title("[SPARK-56953][SDP] Implement", ["SQL"])
+ '[SPARK-56953][SQL][SDP] Implement'
+ >>> insert_components_into_title(
+ ... "[SPARK-56953][SDP] Implement", ["SQL", "CORE"])
+ '[SPARK-56953][SQL][CORE][SDP] Implement'
+ >>> insert_components_into_title("[MINOR] Fix typo", ["CORE"])
+ '[CORE][MINOR] Fix typo'
+ >>> insert_components_into_title("[MINOR] Fix typo", ["CORE", "SQL"])
+ '[CORE][SQL][MINOR] Fix typo'
+ """
+ component_tag = "".join("[%s]" % c for c in components)
+ m = re.match(r"^((?:\[SPARK-[0-9]{3,6}\])+)\s*(.*)$", title)
+ if m and m.group(1):
+ refs, rest = m.group(1), m.group(2)
+ if not rest:
+ return "%s%s" % (refs, component_tag)
+ sep = "" if rest.startswith("[") else " "
+ return "%s%s%s%s" % (refs, component_tag, sep, rest)
+ sep = "" if title.startswith("[") else " "
+ return "%s%s%s" % (component_tag, sep, title)
+
+
+def prompt_for_components(title):
+ """
+ Prompt the committer for component(s) when the PR title lacks a recognized
+ [COMPONENT] tag. Validates that at least one entered component is in
+ COMPONENT_MARKERS. Re-prompts until valid input is provided. Returns an
+ uppercase list.
+ """
+ print_error("PR title is missing a recognized [COMPONENT] tag.")
+ print("Common components:")
+ print(" [BUILD] - build system")
+ print(" [CONNECT] - Spark Connect")
+ print(" [CORE] - Spark Core")
+ print(" [DOCS] - documentation changes")
+ print(" [GRAPHX] - GraphX")
+ print(" [INFRA] - project infrastructure")
+ print(" [K8S] - Kubernetes")
+ print(" [ML] - Spark ML (DataFrame-based)")
+ print(" [MLLIB] - MLlib (RDD-based)")
+ print(" [PS] - pandas API on Spark")
+ print(" [PYTHON] - PySpark")
+ print(" [R] - SparkR")
+ print(" [SQL] - Spark SQL")
+ print(" [SS] - Structured Streaming")
+ print(" [STREAMING] - DStream API")
+ print(" [TESTS] - test-only changes")
+ print(" [UI] - Spark Web UI")
+ print(" [YARN] - Hadoop YARN")
+ print("Current title: %s" % title)
+ while True:
+ raw = bold_input(
+ "Enter comma-separated component(s) to insert into the title (e.g.
CORE,SQL): "
+ )
+ components = [c.strip().upper() for c in raw.split(",") if c.strip()]
Review Comment:
```suggestion
components = list(dict.fromkeys(
c.strip().upper() for c in raw.split(",") if c.strip()
))
```
Entering `"SQL,SQL"` produces `[SQL][SQL]` in the title with the current
code. `dict.fromkeys` deduplicates while preserving insertion order.
Complements the existing typo-warning concern raised by another reviewer.
##########
dev/merge_spark_pr.py:
##########
Review Comment:
It seems tags like `[PROJECT INFRA]` were originally supported. Will any
issues arise after this modification?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]