This is an automated email from the ASF dual-hosted git repository.

yihua pushed a commit to branch branch-0.x
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/branch-0.x by this push:
     new 4571b56f6808 chore: Update the outdated PR compliance script (#17883)
4571b56f6808 is described below

commit 4571b56f6808d19686f4dd7808230bac2601b851
Author: Lin Liu <[email protected]>
AuthorDate: Wed Jan 14 18:01:23 2026 -0800

    chore: Update the outdated PR compliance script (#17883)
---
 .github/workflows/pr_compliance.yml |   9 +-
 scripts/pr_compliance.py            | 287 +++++++++++++++++-------------------
 2 files changed, 138 insertions(+), 158 deletions(-)

diff --git a/.github/workflows/pr_compliance.yml 
b/.github/workflows/pr_compliance.yml
index 104a933db7d0..a16de677923f 100644
--- a/.github/workflows/pr_compliance.yml
+++ b/.github/workflows/pr_compliance.yml
@@ -1,4 +1,4 @@
-name: validate pr
+name: PR Compliance
 on:
   pull_request:
     types: [opened, edited, reopened, synchronize]
@@ -6,6 +6,10 @@ on:
       - master
       - branch-0.x
 
+concurrency:
+  group: pr-compliance-${{ github.ref }}
+  cancel-in-progress: ${{ !contains(github.ref, 'master') && 
!contains(github.ref, 'branch-0.x') }}
+
 jobs:
   validate-pr:
     runs-on: ubuntu-latest
@@ -17,6 +21,3 @@ jobs:
       - uses: actions/setup-python@v3
       - name: run script
         run: python3 scripts/pr_compliance.py
-
-
-
diff --git a/scripts/pr_compliance.py b/scripts/pr_compliance.py
index dcd3c4c0caf4..720529c6f70d 100644
--- a/scripts/pr_compliance.py
+++ b/scripts/pr_compliance.py
@@ -15,88 +15,14 @@
 # limitations under the License.
 #
 
-import re
+import inspect
 import os
+import re
 import sys
-import inspect
-# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #  
-#  _____ _ _   _       __     __    _ _     _       _   _                 #
-# |_   _(_) |_| | ___  \ \   / /_ _| (_) __| | __ _| |_(_) ___  _ __      #  
-#   | | | | __| |/ _ \  \ \ / / _` | | |/ _` |/ _` | __| |/ _ \| '_ \     #
-#   | | | | |_| |  __/   \ V / (_| | | | (_| | (_| | |_| | (_) | | | |    #
-#   |_| |_|\__|_|\___|    \_/ \__,_|_|_|\__,_|\__,_|\__|_|\___/|_| |_|    #
-#                                                                         # 
-# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #    
                                                             
 
 
-#validator for titles
-def validate_title(title: str):
-    return len(re.findall('(\[HUDI\-[0-9]{1,}\]|\[MINOR\])',title)) == 1
-
-#runs an individual title test
-#
-#   PARAMS
-# name: str - the name of the test
-# title: str - the title to test
-# isTrue: bool - is the title valid
-#
-#   RETURN
-# bool - True if the test passed, False if it failed
-def run_title_test(name: str, title: str, isTrue: bool):
-    if isTrue != validate_title(title):
-        print(f"{name} - FAILED")
-        return False
-    print(f"{name} - PASSED")
-    return True
-
-#tests for title validation
-#
-#   RETURN
-# bool - True if all tests passed, False if any tests fail
-def test_title():
-    test_return = True
-    #test that position doesn't matter for issue
-    test_return = run_title_test("issue at front", "[HUDI-1324] my fake pr", 
True) and test_return
-    test_return =  run_title_test("issue in middle", " my [HUDI-1324] fake 
pr", True) and test_return
-    test_return =  run_title_test("issue at end", " my fake pr [HUDI-1324]", 
True)  and test_return
-
-    #test position doesn't matter for minor
-    test_return = run_title_test("minor at front", "[MINOR] my fake pr", True) 
and test_return
-    test_return = run_title_test("minor in middle", " my [MINOR] fake pr", 
True) and test_return
-    test_return = run_title_test("minor at end", " my fake pr [MINOR]", True) 
and test_return
-
-    #test that more than 4 nums is also ok
-    test_return = run_title_test("more than 4 nums in issue", "[HUDI-12345] my 
fake pr", True) and test_return
-
-    #test that 1 nums is also ok
-    test_return = run_title_test("1 num in issue", "[HUDI-1] my fake pr", 
True) and test_return
-
-    #no nums not ok
-    test_return = run_title_test("no nums in issue", "[HUDI-] my fake pr", 
False) and test_return
-
-    #no brackets not ok
-    test_return = run_title_test("no brackets around issue", "HUDI-1234 my 
fake pr", False) and test_return
-    test_return = run_title_test("no brackets around minor", "MINOR my fake 
pr", False) and test_return
-    
-    #lowercase not ok
-    test_return = run_title_test("lowercase hudi", "[hudi-1234] my fake pr", 
False) and test_return
-    test_return = run_title_test("lowercase minor", "[minor] my fake pr", 
False) and test_return
-    
-    #duplicate not ok
-    test_return = run_title_test("duplicate issue", "[HUDI-1324][HUDI-1324] my 
fake pr", False) and test_return
-    test_return = run_title_test("duplicate minor", "[MINOR] my fake pr 
[MINOR]", False) and test_return
-    
-    #hudi and minor not ok
-    test_return = run_title_test("issue and minor", "[HUDI-1324] my 
[MINOR]fake pr", False) and test_return
-    print("*****")
-    if test_return:
-        print("All title tests passed")
-    else:
-        print("Some title tests failed")
-    print("*****")
-
-    return test_return
-
+# PR titles are now validated by the separate GitHub Action workflow
+# This script only validates PR body content and GitHub issue links for fix PRs
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 #  ____            _        __     __    _ _     _       _   _                #
@@ -106,8 +32,8 @@ def test_title():
 # |____/ \___/ \__,_|\__, |    \_/ \__,_|_|_|\__,_|\__,_|\__|_|\___/|_| |_|   #
 #                    |___/                                                    #
 #                                                                             #
-# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
#                                            
-    
+# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
 #Enums for the outcome of parsing a single line
 class Outcomes:
     #error was found so we should stop parsing and exit with error
@@ -116,7 +42,7 @@ class Outcomes:
     #continue to parse the next line
     CONTINUE = 1
 
-    #All requirements for the current section have been met so we should start 
+    #All requirements for the current section have been met so we should start
     #parsing the next section
     NEXTSECTION = 2
 
@@ -128,7 +54,7 @@ class Outcomes:
 #   PARAMS
 # name: str - name of the parse section
 # identifier: str - line that signifies the start of a section
-# linesAfter: set of str - default lines in the template that we ignore when 
+# linesAfter: set of str - default lines in the template that we ignore when
 #                          verifying that the user filled out the section
 class ParseSectionData:
     def __init__(self, name: str, identifier: str, linesAfter: str):
@@ -137,13 +63,13 @@ class ParseSectionData:
         self.linesAfter = linesAfter
         self.prevSection = ""
         self.nextSection = ""
-        
-        
-    
+
+
+
     #returns true if line matches the identifier
     def identify(self, line: str):
         return line == self.identifier
-    
+
     #returns true if user has added new text to the section
     def identifyAfter(self, line: str):
         return line not in self.linesAfter
@@ -154,8 +80,8 @@ class ParseSectionData:
 class RiskLevelData(ParseSectionData):
     def __init__(self, name: str, identifier: str, linesAfter):
         super().__init__(name, identifier, linesAfter)
-    
-    #we check that the line start with the identifier because the identifier 
+
+    #we check that the line start with the identifier because the identifier
     #line will be modified when filled out correctly
     def identify(self, line: str):
         return line.startswith(self.identifier)
@@ -174,16 +100,16 @@ class ParseSections:
                 psd[i].prevSection = "START"
             else:
                 psd[i].prevSection = psd[prevI].name
-            
+
             if nextI >= len(psd):
                 psd[i].nextSection = "END"
             else:
                 psd[i].nextSection = psd[nextI].name
-            
+
             self.sections[psd[i].name] = psd[i]
-            
 
-    
+
+
     #returns true if line is an identifier for a section that is not value
     #   PARAMS
     # line: str - the line that we are parsing
@@ -193,8 +119,8 @@ class ParseSections:
             if name != value:
                 if self.sections[name].identify(line):
                     return True
-        return False    
-    
+        return False
+
     #gets the name of the section identified in the line
     #   PARAMS
     # line: str - the line that we are parsing
@@ -204,7 +130,7 @@ class ParseSections:
         for name in self.sections:
             if self.sections[name].identify(line):
                 return name
-        return None    
+        return None
 
 
     #returns the ParseSectionData that is named name
@@ -240,7 +166,7 @@ class ParseSection:
     def nextSection(self):
         return self.data.nextSection
 
-    #Returns true if we have already found the section identifier and line is 
+    #Returns true if we have already found the section identifier and line is
     #not in the default template
     def validateAfter(self, line):
         return self.found and self.data.identifyAfter(line)
@@ -250,7 +176,7 @@ class ParseSection:
         if self.found:
             #since we have already found the identifier, this means that we 
have
             #found a duplicate of the identifier
-            self.error(line, lineno, f"Duplicate {self.data.name} section 
found")
+            self.error(line, lineno, f"Duplicate {self.data.identifier} 
section found")
             return Outcomes.ERROR
         self.found = True
         return Outcomes.CONTINUE
@@ -260,9 +186,9 @@ class ParseSection:
             if self.nextSection() != "END" and 
self.sections.sections[self.nextSection()].identify(line):
                 #we found the next identifier but haven't found a description
                 #yet for this section
-                return f"Section {self.data.name} is missing a description"
+                return f"Section {self.data.identifier} is missing a 
description"
             #we found a section other than the next section
-            return f"Section {self.data.name} should be followed by section 
{self.data.nextSection}"
+            return f"Section {self.data.identifier} should be followed by 
section {self.data.nextSection}"
 
         #section identifier has not been found yet
         sectionFound = self.sections.getSectionName(line)
@@ -273,13 +199,13 @@ class ParseSection:
             #we have not found the current section identifier but we found the
             #previous section identifier again
             return f"Duplicate {self.data.prevSection} section found"
-         
+
         if self.data.prevSection == "START":
-            return f"Section {self.data.name} should be the first section"
+            return f"Section {self.data.identifier} should be the first 
section"
         if sectionFound == self.data.nextSection:
-            return f"Missing section {self.data.name} between 
{self.data.prevSection} and {self.data.nextSection}"
-        return f"Section {self.data.name} was expected after section 
{self.data.prevSection}"
-    
+            return f"Missing section {self.data.identifier} between 
{self.data.prevSection} and {self.data.nextSection}"
+        return f"Section {self.data.identifier} was expected after section 
{self.data.prevSection}"
+
     #Decides the outcome state by processing line
     def validateLine(self,line,lineno):
         if self.data.identify(line):
@@ -294,7 +220,7 @@ class ParseSection:
             #the pr author has added new text to this section so we consider it
             #to be filled out
             if self.nextSection() == "END":
-                #if next section is "END" then there are no more sections 
+                #if next section is "END" then there are no more sections
                 #to process
                 return Outcomes.SUCCESS
             return Outcomes.NEXTSECTION
@@ -347,7 +273,7 @@ class ValidateBody:
         if data is None:
             print(f"ERROR with your parse section setup. Parse section 
{sectionName} not found")
             exit(-3)
-        
+
         #create the section
         if data.name == "CHECKLIST":
             self.section = NoDataSection(data=data, sections=self.sections, 
debug=self.debug)
@@ -355,7 +281,7 @@ class ValidateBody:
             self.section = ParseSection(data=data, sections=self.sections, 
debug=self.debug)
 
     #Returns true if the body complies with the validation rules else false
-    def validate(self):
+    def validate(self, pr_title=None):
         #instantiate self.section since it starts null
         self.nextSection()
 
@@ -367,7 +293,7 @@ class ValidateBody:
 
             #run the parse section validation
             o = self.section.validateLine(line, lineno)
-            
+
             #decide what to do based on outcome
             if o == Outcomes.ERROR:
                 return False
@@ -375,43 +301,50 @@ class ValidateBody:
                 return True
             elif o == Outcomes.NEXTSECTION:
                 self.nextSection()
-        #if we get through all the lines without a success outcome, then the 
+        #if we get through all the lines without a success outcome, then the
         #body does not comply
         if self.section.data.nextSection == "END":
             if self.section.found:
-                self.section.error("","",f"Section {self.section.data.name} is 
missing a description")
+                self.section.error("", "", f"Section 
{self.section.data.identifier} is missing a description")
                 return False
-            self.section.error("","",f"Missing section 
{self.section.data.name} at the end")
+            self.section.error("", "", f"Missing section 
{self.section.data.identifier} at the end")
             return False
         self.section.error("","", "Please make sure you have filled out the 
template correctly. You can find a blank template in 
/.github/PULL_REQUEST_TEMPLATE.md")
         return False
-        
+
 #Generate the validator for the current template.
 #needs to be manually updated
 def make_default_validator(body, debug=False):
+    problemStatement = ParseSectionData("PROBLEM_STATEMENT",
+                                        "### Describe the issue this Pull 
Request addresses",
+                                        {"<!-- Either describe the issue 
inline here with motivation behind the changes",
+                                         "     (or) link to an issue by 
including `Closes #<issue-number>` for context.",
+                                         "      If this PR includes changes to 
the storage format, public APIs,",
+                                         "      or has breaking changes, use 
`!` (e.g., feat!: ...) -->"})
     changelogs = ParseSectionData("CHANGE_LOGS",
-        "### Change Logs",
-        {"_Describe context and summary for this change. Highlight if any code 
was copied._"})
+        "### Summary and Changelog",
+        {"<!-- Short, plain-English summary of what users gain or what changed 
in behavior.",
+         "     Followed by a detailed log of all the changes. Highlight if any 
code was copied. -->"})
     impact = ParseSectionData("IMPACT",
         "### Impact",
-        {"_Describe any public API or user-facing feature change or any 
performance impact._"})
+        {"<!-- Describe any public API or user-facing feature change or any 
performance impact. -->"})
     risklevel = RiskLevelData("RISK_LEVEL",
-        "### Risk level",
-        {"_If medium or high, explain what verification was done to mitigate 
the risks._"})
+        "### Risk Level",
+        {"<!-- Accepted values: none, low, medium or high. Other than `none`, 
explain the risk.",
+         "     If medium or high, explain what verification was done to 
mitigate the risks. -->"})
     docsUpdate = ParseSectionData("DOCUMENTATION_UPDATE",
         "### Documentation Update",
-        {"_Describe any necessary documentation update if there is any new 
feature, config, or user-facing change_",
+        {"<!-- Describe any necessary documentation update if there is any new 
feature, config, or user-facing change. If not, put \"none\".",
         "",
-        "- _The config description must be updated if new configs are added or 
the default value of the configs are changed. If not, put \"none\"._",
-        "- _Any new feature or user-facing change requires updating the Hudi 
website. Please create a Jira ticket, attach the",
-        "  ticket number here and follow the 
[instruction](https://hudi.apache.org/contribute/developer-setup#website) to 
make",
-        "  changes to the website._"})
+        "- The config description must be updated if new configs are added or 
the default value of the configs are changed.",
+        "- Any new feature or user-facing change requires updating the Hudi 
website. Please follow the",
+        "  
[instruction](https://hudi.apache.org/contribute/developer-setup#website) to 
make changes to the website. -->"})
     checklist = ParseSectionData("CHECKLIST",
         "### Contributor's checklist",
         {})
-    parseSections = ParseSections([changelogs, impact, risklevel, docsUpdate, 
checklist])
+    parseSections = ParseSections([problemStatement, changelogs, impact, 
risklevel, docsUpdate, checklist])
 
-    return ValidateBody(body, "CHANGE_LOGS", parseSections, debug)
+    return ValidateBody(body, "PROBLEM_STATEMENT", parseSections, debug)
 
 
 #takes a list of strings and returns a string of those lines separated by \n
@@ -424,9 +357,10 @@ def joinLines(lines):
 # body: str - the body to parse
 # isTrue: bool - True if the body complies with our validation rules
 # debug: bool - True if we want to print debug information
-def run_test(name: str, body: str, isTrue: bool, debug: bool):
+# title: str - the PR title (optional, for fix PR validation)
+def run_test(name: str, body: str, isTrue: bool, debug: bool, title: str = 
None):
     validator = make_default_validator(body, debug)
-    if isTrue != validator.validate():
+    if isTrue != validator.validate(title):
         print(f"{name} - FAILED")
         return False
     print(f"{name} - PASSED")
@@ -444,10 +378,24 @@ def build_body(sections):
 def test_body():
     DEBUG_MESSAGES = False
     #Create sections that we will combine to create bodies to test validation 
on
+    template_problem_statement = [
+        "### Describe the issue this Pull Request addresses",
+        "",
+        "<!-- Either describe the issue inline here with motivation behind the 
changes",
+        "     (or) link to an issue by including `Closes #<issue-number>` for 
context.",
+        "      If this PR includes changes to the storage format, public 
APIs,",
+        "      or has breaking changes, use `!` (e.g., feat!: ...) -->",
+        ""
+    ]
+
+    good_problem_statement = template_problem_statement.copy()
+    good_problem_statement[1] = "problem statement description"
+
     template_changelogs = [
-         "### Change Logs",
+         "### Summary and Changelog",
         "",
-        "_Describe context and summary for this change. Highlight if any code 
was copied._",
+        "<!-- Short, plain-English summary of what users gain or what changed 
in behavior.",
+        "     Followed by a detailed log of all the changes. Highlight if any 
code was copied. -->",
         ""
     ]
 
@@ -457,7 +405,7 @@ def test_body():
     template_impact = [
         "### Impact",
         "",
-        "_Describe any public API or user-facing feature change or any 
performance impact._",
+        "<!-- Describe any public API or user-facing feature change or any 
performance impact. -->",
         ""
     ]
 
@@ -465,9 +413,10 @@ def test_body():
     good_impact[1] = "impact description"
 
     template_risklevel = [
-        "### Risk level (write none, low medium or high below)",
+        "### Risk Level",
         "",
-        "_If medium or high, explain what verification was done to mitigate 
the risks._",
+        "<!-- Accepted values: none, low, medium or high. Other than `none`, 
explain the risk.",
+        "     If medium or high, explain what verification was done to 
mitigate the risks. -->",
         ""
     ]
 
@@ -477,12 +426,11 @@ def test_body():
     template_docs_update = [
         "### Documentation Update",
         "",
-        "_Describe any necessary documentation update if there is any new 
feature, config, or user-facing change_",
+        "<!-- Describe any necessary documentation update if there is any new 
feature, config, or user-facing change. If not, put \"none\".",
         "",
-        "- _The config description must be updated if new configs are added or 
the default value of the configs are changed. If not, put \"none\"._",
-        "- _Any new feature or user-facing change requires updating the Hudi 
website. Please create a Jira ticket, attach the",
-        "  ticket number here and follow the 
[instruction](https://hudi.apache.org/contribute/developer-setup#website) to 
make",
-        "  changes to the website._",
+        "- The config description must be updated if new configs are added or 
the default value of the configs are changed.",
+        "- Any new feature or user-facing change requires updating the Hudi 
website. Please follow the",
+        "  
[instruction](https://hudi.apache.org/contribute/developer-setup#website) to 
make changes to the website. -->",
         ""
     ]
 
@@ -493,16 +441,17 @@ def test_body():
         "### Contributor's checklist",
         "",
         "- [ ] Read through [contributor's 
guide](https://hudi.apache.org/contribute/how-to-contribute)",
-        "- [ ] Change Logs and Impact were stated clearly",
-        "- [ ] Adequate tests were added if applicable",
-        "- [ ] CI passed"
+        "- [ ] Enough context is provided in the sections above",
+        "- [ ] Adequate tests were added if applicable"
     ]
 
-    #list of sections that when combined form a valid body
-    good_sections = [good_changelogs, good_impact, good_risklevel, 
good_docs_update, template_checklist]
+    # list of sections that when combined from a valid body
+    good_sections = [good_problem_statement, good_changelogs, good_impact, 
good_risklevel, good_docs_update,
+                     template_checklist]
 
-    #list of sections that when combined form the template
-    template_sections = [template_changelogs, template_impact, 
template_risklevel, template_docs_update, template_checklist]
+    # list of sections that when combined from the template
+    template_sections = [template_problem_statement, template_changelogs, 
template_impact, template_risklevel,
+                         template_docs_update, template_checklist]
 
     tests_passed = True
     #Test section not filled out
@@ -524,7 +473,7 @@ def test_body():
             if j == i:
                 test_sections.append(good_sections[j].copy())
         tests_passed = run_test(f"duplicate section: {i}", 
build_body(test_sections), False, DEBUG_MESSAGES) and tests_passed
-    
+
     #Test out of order section
     for i in range(len(good_sections)-1):
         test_sections = []
@@ -541,25 +490,59 @@ def test_body():
             if i != j:
                 test_sections.append(good_sections[j].copy())
         tests_passed = run_test(f"Missing Section: {i}", 
build_body(test_sections), False, DEBUG_MESSAGES) and tests_passed
-    
+
     #Test good body:
     tests_passed = run_test("good documentation", build_body(good_sections), 
True, DEBUG_MESSAGES) and tests_passed
 
+    # Test fix PR issue link requirement
+    fix_changelogs_with_issue = good_changelogs.copy()
+    fix_changelogs_with_issue[1] = "Fixed bug. Fixes #123"
+    fix_sections_with_issue = [good_problem_statement, 
fix_changelogs_with_issue, good_impact, good_risklevel,
+                               good_docs_update,
+                               template_checklist]
+    tests_passed = run_test("fix PR with issue link", 
build_body(fix_sections_with_issue), True,
+                            DEBUG_MESSAGES,
+                            "fix: fix bug") and tests_passed
+    tests_passed = run_test("fix PR with scope and issue link", 
build_body(fix_sections_with_issue), True,
+                            DEBUG_MESSAGES,
+                            "fix(index): fix bug") and tests_passed
+
+    fix_changelogs_with_github_url = good_changelogs.copy()
+    fix_changelogs_with_github_url[1] = "Fixed bug. See 
https://github.com/apache/hudi/issues/123";
+    fix_sections_with_github_url = [good_problem_statement, 
fix_changelogs_with_github_url, good_impact, good_risklevel,
+                                    good_docs_update,
+                                    template_checklist]
+    tests_passed = run_test("fix PR with GitHub URL", 
build_body(fix_sections_with_github_url), True,
+                            DEBUG_MESSAGES,
+                            "fix: fix bug") and tests_passed
+
+    fix_changelogs_without_issue = good_changelogs.copy()
+    fix_changelogs_without_issue[1] = "Fixed bug without issue reference"
+    fix_sections_without_issue = [good_problem_statement, 
fix_changelogs_without_issue, good_impact, good_risklevel,
+                                  good_docs_update,
+                                  template_checklist]
+    tests_passed = run_test("fix PR without issue link", 
build_body(fix_sections_without_issue), True,
+                            DEBUG_MESSAGES,
+                            "fix: fix bug") and tests_passed
+
+    # Non-fix PRs should not require issue links
+    tests_passed = run_test("feat PR without issue link", 
build_body(good_sections), True, DEBUG_MESSAGES,
+                            "feat: new feature") and tests_passed
+
     print("*****")
     if tests_passed:
         print("All body tests passed")
     else:
         print("Some body tests failed")
     print("*****")
-    
+
     return tests_passed
 
 
 if __name__ == '__main__':
     if len(sys.argv) > 1:
-        title_tests = test_title()
         body_tests = test_body()
-        if title_tests and body_tests:
+        if body_tests:
             exit(0)
         else:
             exit(-1)
@@ -572,15 +555,11 @@ if __name__ == '__main__':
         print("no title")
         exit(-1)
 
-    if not validate_title(title):
-        print("invalid title")
-        exit(-1)
-
     if body is None:
         print("no pr body")
         exit(-1)
 
-    validator = make_default_validator(body,True)
-    if not validator.validate():
+    validator = make_default_validator(body, True)
+    if not validator.validate(title):
         exit(-1)
     exit(0)

Reply via email to