This is an automated email from the ASF dual-hosted git repository.

HyukjinKwon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 61672076ffd0 [SPARK-56692][INFRA] Check release date of upgrade of pom 
dependency
61672076ffd0 is described below

commit 61672076ffd0e728c554fc2890fe343f22a88e27
Author: Tian Gao <[email protected]>
AuthorDate: Sun May 10 19:01:54 2026 +0900

    [SPARK-56692][INFRA] Check release date of upgrade of pom dependency
    
    ### What changes were proposed in this pull request?
    
    According to our discussion on the dev channel, we want a 7-day buffer 
between the release of a third-party package and our upgrade to it. This is 
the CI check that blocks PRs that violate the policy.
    
    Basically, if `pom.xml` is modified, the test script finds the upgraded 
versions, checks the Maven registry for each artifact's last-modified time, and 
raises an exception if the release is too recent.
    
    ### Why are the changes needed?
    
    To prevent accidentally upgrading to a dependency version that was released too recently.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, it's infra only.
    
    ### How was this patch tested?
    
    Tested locally. I also intentionally upgraded `fasterxml.jackson` to a 
version that was released 2 days ago - the CI should be able to catch it.
    
    This also means the PR should **not** be merged until the intentional 
`fasterxml.jackson` upgrade is reverted.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #55641 from gaogaotiantian/check-pom-dependency.
    
    Authored-by: Tian Gao <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 dev/run-tests.py              |  9 ++++
 dev/sparktestsupport/utils.py | 98 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index 685621193dd6..35d2a9ca2e9b 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -28,6 +28,7 @@ from contextlib import contextmanager
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
 from sparktestsupport.shellutils import exit_from_command_with_retcode, 
run_cmd, rm_r, which
 from sparktestsupport.utils import (
+    check_upgraded_pom_dependencies,
     determine_dangling_python_tests,
     determine_modules_for_files,
     determine_modules_to_test,
@@ -559,10 +560,18 @@ def main():
                 changed_files = identify_changed_files_from_git_commits(
                     "HEAD", target_ref=os.environ["APACHE_SPARK_REF"]
                 )
+                if "pom.xml" in changed_files:
+                    check_upgraded_pom_dependencies(
+                        os.environ["GITHUB_SHA"], 
target_ref=os.environ["APACHE_SPARK_REF"]
+                    )
             elif is_github_prev_sha:
                 changed_files = identify_changed_files_from_git_commits(
                     os.environ["GITHUB_SHA"], 
target_ref=os.environ["GITHUB_PREV_SHA"]
                 )
+                if "pom.xml" in changed_files:
+                    check_upgraded_pom_dependencies(
+                        os.environ["GITHUB_SHA"], 
target_ref=os.environ["GITHUB_PREV_SHA"]
+                    )
 
             dangling_python_tests = 
determine_dangling_python_tests(changed_files)
             if dangling_python_tests:
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index b969b96a16c5..c1f07b534dc0 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -18,6 +18,7 @@
 #
 
 import os
+import re
 import sys
 import subprocess
 from sparktestsupport import modules
@@ -97,6 +98,103 @@ def identify_changed_files_from_git_commits(patch_sha, 
target_branch=None, targe
     return [f for f in raw_output.split("\n") if f]
 
 
+def check_upgraded_pom_dependencies(
+    patch_sha, target_branch=None, target_ref=None, buffer_days=7, verbose=True
+):
+    """
+    Check whether the pom.xml dependency upgrade has been released at least 
`buffer_days` days ago.
+
+    Raise ValueError if the dependency is released within the last 
`buffer_days` days.
+    """
+
+    def get_release_timestamp(group_id, artifact_id, version):
+        import urllib.request
+        from email.utils import parsedate_to_datetime
+
+        host = os.environ.get(
+            "MAVEN_MIRROR_URL", 
"https://maven-central.storage-download.googleapis.com/maven2"
+        )
+        url = f"{host}/{group_id.replace('.', 
'/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom"
+        req = urllib.request.Request(url, method="HEAD")
+        try:
+            with urllib.request.urlopen(req) as response:
+                return 
parsedate_to_datetime(response.headers.get("Last-Modified")).timestamp()
+        except Exception:
+            return None
+
+    if target_branch is None and target_ref is None:
+        raise AttributeError("must specify either target_branch or target_ref")
+    elif target_branch is not None and target_ref is not None:
+        raise AttributeError("must specify either target_branch or target_ref, 
not both")
+    if target_branch is not None:
+        diff_target = target_branch
+        run_cmd(["git", "fetch", "origin", str(target_branch + ":" + 
target_branch)])
+    else:
+        diff_target = target_ref
+    # The correct grammar is git diff <old> <new>, but 
identify_changed_files_from_git_commits
+    # uses it differently. It doesn't matter for that function because it only 
needs the file
+    # name, but we need to know which change is "new" to locate the new 
version.
+    raw_output = subprocess.check_output(
+        ["git", "diff", diff_target, patch_sha, ":(top)pom.xml"], 
universal_newlines=True
+    )
+
+    changed_versions = []
+
+    # "+    <oro.version>2.0.9</oro.version>" -> "oro.version", "2.0.9"
+    new_version_regex = 
r"^\+\s*<(?P<dependency>.*?\.version)>(?P<version>.*?)</.*?>"
+    for line in raw_output.split("\n"):
+        if match := re.match(new_version_regex, line):
+            changed_versions.append((match.group("dependency"), 
match.group("version")))
+
+    if changed_versions:
+        # Okay now we parse the pom.xml to find the real dependency name
+        import datetime
+        import xml.etree.ElementTree as ET
+
+        if verbose:
+            print("Changed version in pom.xml detected:")
+            for dep, ver in changed_versions:
+                print(f"  {dep}: {ver}")
+
+        root_dir = os.path.join(os.path.dirname(__file__), "..", "..")
+        pom_path = os.path.join(root_dir, "pom.xml")
+        tree = ET.parse(pom_path)
+        root = tree.getroot()
+        namespace = re.match(r"\{(.*?)\}project", root.tag).group(1)
+        ns = {"m": namespace}
+        for dependency in root.findall(".//m:dependency", ns):
+            group_id = dependency.find("m:groupId", ns).text
+            artifact_id = dependency.find("m:artifactId", ns).text
+            version = dependency.find("m:version", ns)
+            if version is not None:
+                version = version.text
+
+            for dep, ver in changed_versions:
+                template = "${" + dep + "}"
+                if version is not None and template in version:
+                    version = version.replace("${" + dep + "}", ver)
+                elif template in artifact_id:
+                    artifact_id = artifact_id.replace("${" + dep + "}", ver)
+                else:
+                    # If we can't find the related upgrade version, just skip
+                    continue
+                release_timestamp = get_release_timestamp(group_id, 
artifact_id, version)
+                if release_timestamp is None:
+                    raise ValueError(
+                        f"Could not find release date for 
{group_id}:{artifact_id}:{version}"
+                    )
+
+                release_date = 
datetime.datetime.fromtimestamp(release_timestamp).date()
+                if verbose:
+                    print(f"  {group_id}:{artifact_id}:{version} released on 
{release_date}")
+                if release_date > datetime.datetime.now().date() - 
datetime.timedelta(
+                    days=buffer_days
+                ):
+                    raise ValueError(
+                        f"Dependency {group_id}:{artifact_id}:{version} is 
released within the last {buffer_days} days"
+                    )
+
+
 def determine_modules_to_test(changed_modules, deduplicated=True):
     """
     Given a set of modules that have changed, compute the transitive closure 
of those modules'


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to