This is an automated email from the ASF dual-hosted git repository.
HyukjinKwon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 61672076ffd0 [SPARK-56692][INFRA] Check release date of upgrade of pom
dependency
61672076ffd0 is described below
commit 61672076ffd0e728c554fc2890fe343f22a88e27
Author: Tian Gao <[email protected]>
AuthorDate: Sun May 10 19:01:54 2026 +0900
[SPARK-56692][INFRA] Check release date of upgrade of pom dependency
### What changes were proposed in this pull request?
According to our discussion on the dev channel, we want a 7-day buffer
between the release of a third-party package and our upgrade of that
dependency. This is the CI check to prevent PRs that break the policy.
Basically, if `pom.xml` is modified, the test script searches for the
upgraded version, checks the Maven registry for its last-modified time, and
raises an exception if the release is too recent.
### Why are the changes needed?
To prevent accidentally upgrading dependencies too soon after their release.
### Does this PR introduce _any_ user-facing change?
No, it's infra only.
### How was this patch tested?
Locally tested that it works. I intentionally upgraded `fasterxml.jackson`
which was released 2 days ago - the CI should be able to catch it.
It also means this PR should **not** be merged before this change is
reverted.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #55641 from gaogaotiantian/check-pom-dependency.
Authored-by: Tian Gao <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/run-tests.py | 9 ++++
dev/sparktestsupport/utils.py | 98 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 107 insertions(+)
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 685621193dd6..35d2a9ca2e9b 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -28,6 +28,7 @@ from contextlib import contextmanager
from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
from sparktestsupport.shellutils import exit_from_command_with_retcode,
run_cmd, rm_r, which
from sparktestsupport.utils import (
+ check_upgraded_pom_dependencies,
determine_dangling_python_tests,
determine_modules_for_files,
determine_modules_to_test,
@@ -559,10 +560,18 @@ def main():
changed_files = identify_changed_files_from_git_commits(
"HEAD", target_ref=os.environ["APACHE_SPARK_REF"]
)
+ if "pom.xml" in changed_files:
+ check_upgraded_pom_dependencies(
+ os.environ["GITHUB_SHA"],
target_ref=os.environ["APACHE_SPARK_REF"]
+ )
elif is_github_prev_sha:
changed_files = identify_changed_files_from_git_commits(
os.environ["GITHUB_SHA"],
target_ref=os.environ["GITHUB_PREV_SHA"]
)
+ if "pom.xml" in changed_files:
+ check_upgraded_pom_dependencies(
+ os.environ["GITHUB_SHA"],
target_ref=os.environ["GITHUB_PREV_SHA"]
+ )
dangling_python_tests =
determine_dangling_python_tests(changed_files)
if dangling_python_tests:
diff --git a/dev/sparktestsupport/utils.py b/dev/sparktestsupport/utils.py
index b969b96a16c5..c1f07b534dc0 100755
--- a/dev/sparktestsupport/utils.py
+++ b/dev/sparktestsupport/utils.py
@@ -18,6 +18,7 @@
#
import os
+import re
import sys
import subprocess
from sparktestsupport import modules
@@ -97,6 +98,103 @@ def identify_changed_files_from_git_commits(patch_sha,
target_branch=None, targe
return [f for f in raw_output.split("\n") if f]
+def check_upgraded_pom_dependencies(
+ patch_sha, target_branch=None, target_ref=None, buffer_days=7, verbose=True
+):
+ """
+ Check whether the pom.xml dependency upgrade has been released at least
`buffer_days` days ago.
+
+ Raise ValueError if the dependency is released within the last
`buffer_days` days.
+ """
+
+ def get_release_timestamp(group_id, artifact_id, version):
+ import urllib.request
+ from email.utils import parsedate_to_datetime
+
+ host = os.environ.get(
+ "MAVEN_MIRROR_URL",
"https://maven-central.storage-download.googleapis.com/maven2"
+ )
+ url = f"{host}/{group_id.replace('.',
'/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom"
+ req = urllib.request.Request(url, method="HEAD")
+ try:
+ with urllib.request.urlopen(req) as response:
+ return
parsedate_to_datetime(response.headers.get("Last-Modified")).timestamp()
+ except Exception:
+ return None
+
+ if target_branch is None and target_ref is None:
+ raise AttributeError("must specify either target_branch or target_ref")
+ elif target_branch is not None and target_ref is not None:
+ raise AttributeError("must specify either target_branch or target_ref,
not both")
+ if target_branch is not None:
+ diff_target = target_branch
+ run_cmd(["git", "fetch", "origin", str(target_branch + ":" +
target_branch)])
+ else:
+ diff_target = target_ref
+ # The correct grammar is git diff <old> <new>, but
identify_changed_files_from_git_commits
+ # uses it differently. It doesn't matter for that function because it only
needs the file
+ # name, but we need to know which change is "new" to locate the new
version.
+ raw_output = subprocess.check_output(
+ ["git", "diff", diff_target, patch_sha, ":(top)pom.xml"],
universal_newlines=True
+ )
+
+ changed_versions = []
+
+ # "+ <oro.version>2.0.9</oro.version>" -> "oro.version", "2.0.9"
+ new_version_regex =
r"^\+\s*<(?P<dependency>.*?\.version)>(?P<version>.*?)</.*?>"
+ for line in raw_output.split("\n"):
+ if match := re.match(new_version_regex, line):
+ changed_versions.append((match.group("dependency"),
match.group("version")))
+
+ if changed_versions:
+ # Okay now we parse the pom.xml to find the real dependency name
+ import datetime
+ import xml.etree.ElementTree as ET
+
+ if verbose:
+ print("Changed version in pom.xml detected:")
+ for dep, ver in changed_versions:
+ print(f" {dep}: {ver}")
+
+ root_dir = os.path.join(os.path.dirname(__file__), "..", "..")
+ pom_path = os.path.join(root_dir, "pom.xml")
+ tree = ET.parse(pom_path)
+ root = tree.getroot()
+ namespace = re.match(r"\{(.*?)\}project", root.tag).group(1)
+ ns = {"m": namespace}
+ for dependency in root.findall(".//m:dependency", ns):
+ group_id = dependency.find("m:groupId", ns).text
+ artifact_id = dependency.find("m:artifactId", ns).text
+ version = dependency.find("m:version", ns)
+ if version is not None:
+ version = version.text
+
+ for dep, ver in changed_versions:
+ template = "${" + dep + "}"
+ if version is not None and template in version:
+ version = version.replace("${" + dep + "}", ver)
+ elif template in artifact_id:
+ artifact_id = artifact_id.replace("${" + dep + "}", ver)
+ else:
+ # If we can't find the related upgrade version, just skip
+ continue
+ release_timestamp = get_release_timestamp(group_id,
artifact_id, version)
+ if release_timestamp is None:
+ raise ValueError(
+ f"Could not find release date for
{group_id}:{artifact_id}:{version}"
+ )
+
+ release_date =
datetime.datetime.fromtimestamp(release_timestamp).date()
+ if verbose:
+ print(f" {group_id}:{artifact_id}:{version} released on
{release_date}")
+ if release_date > datetime.datetime.now().date() -
datetime.timedelta(
+ days=buffer_days
+ ):
+ raise ValueError(
+ f"Dependency {group_id}:{artifact_id}:{version} is
released within the last {buffer_days} days"
+ )
+
+
def determine_modules_to_test(changed_modules, deduplicated=True):
"""
Given a set of modules that have changed, compute the transitive closure
of those modules'
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]