This is an automated email from the ASF dual-hosted git repository.
assignuser pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4937cf5721 GH-46075: [Release][CI] Fix binary verification (#46076)
4937cf5721 is described below
commit 4937cf5721bd4912438964377361c4ec49fd5e80
Author: Jacob Wujciak-Jens <[email protected]>
AuthorDate: Tue Apr 15 17:27:09 2025 +0200
GH-46075: [Release][CI] Fix binary verification (#46076)
### Rationale for this change
### What changes are included in this PR?
### Are these changes tested?
### Are there any user-facing changes?
**This PR includes breaking changes to public APIs.** (If there are any
breaking changes to public APIs, please explain which changes are breaking. If
not, you can remove this.)
**This PR contains a "Critical Fix".** (If the changes fix either (a) a
security vulnerability, (b) a bug that caused incorrect or invalid data to be
produced, or (c) a bug that causes a crash (even when the API contract is
upheld), please provide explanation. If not, you can remove this.)
* GitHub Issue: #46075
Authored-by: Jacob Wujciak-Jens <[email protected]>
Signed-off-by: Jacob Wujciak-Jens <[email protected]>
---
.github/workflows/verify_rc.yml | 44 ++++--
dev/release/download_rc_binaries.py | 178 ++++++++++++++----------
dev/release/verify-release-candidate-wheels.bat | 31 ++---
dev/release/verify-release-candidate.sh | 10 +-
4 files changed, 155 insertions(+), 108 deletions(-)
diff --git a/.github/workflows/verify_rc.yml b/.github/workflows/verify_rc.yml
index fe46ae6f23..dceb04a492 100644
--- a/.github/workflows/verify_rc.yml
+++ b/.github/workflows/verify_rc.yml
@@ -21,6 +21,16 @@ on:
push:
tags:
- "*-rc*"
+ pull_request:
+ paths:
+ - ".github/workflows/verify_rc.yml"
+ workflow_dispatch:
+ inputs:
+ rc_tag:
+ description: "Tag of the rc to verify"
+ type: string
+ required: true
+
permissions:
contents: read
@@ -28,6 +38,7 @@ permissions:
env:
TEST_DEFAULT: "0"
VERBOSE: "1"
+ RC_TAG: "${{ inputs.rc_tag || github.event_name == 'pull_request' && 'apache-arrow-20.0.0-rc0' || github.ref_name }}"
jobs:
apt:
@@ -46,9 +57,9 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Run
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
version=${package_id#apache-arrow-}
- rc=${GITHUB_REF_NAME#*-rc}
+ rc=${RC_TAG#*-rc}
dev/release/verify-release-candidate.sh ${version} ${rc}
binary:
@@ -61,9 +72,9 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Run
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
version=${package_id#apache-arrow-}
- rc=${GITHUB_REF_NAME#*-rc}
+ rc=${RC_TAG#*-rc}
dev/release/verify-release-candidate.sh ${version} ${rc}
wheels-linux:
@@ -89,9 +100,9 @@ jobs:
run: python -m pip install -e dev/archery[docker]
- name: Prepare
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV}
- echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV}
+ echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV}
distro=${{ matrix.distro }}
if [ "${distro}" = "conda" ]; then
echo "SERVICE=${distro}-verify-rc" >> ${GITHUB_ENV}
@@ -102,6 +113,8 @@ jobs:
echo "$(echo ${os} | tr a-z A-Z)=${version}" >> ${GITHUB_ENV}
fi
- name: Run
+ env:
+ GH_TOKEN: ${{ github.token }}
run: |
archery docker run \
-e TEST_DEFAULT="${TEST_DEFAULT}" \
@@ -109,6 +122,7 @@ jobs:
-e VERBOSE="${VERBOSE}" \
-e VERIFY_RC="${RC}" \
-e VERIFY_VERSION="${VERSION}" \
+ -e GH_TOKEN="$GH_TOKEN" \
${SERVICE}
wheels-macos:
@@ -126,10 +140,12 @@ jobs:
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Run
+ env:
+ GH_TOKEN: ${{ github.token }}
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
version=${package_id#apache-arrow-}
- rc=${GITHUB_REF_NAME#*-rc}
+ rc=${RC_TAG#*-rc}
dev/release/verify-release-candidate.sh ${version} ${rc}
wheels-windows:
@@ -141,12 +157,14 @@ jobs:
TEST_WHEELS: "1"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ submodules: recursive
- name: Prepare
shell: bash
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV}
- echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV}
+ echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV}
- uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3.1.1
- name: Install System Dependencies
run: |
@@ -156,6 +174,8 @@ jobs:
shell: bash
run: ci/scripts/download_tz_database.sh
- name: Run verification
+ env:
+ GH_TOKEN: ${{ github.token }}
shell: cmd
run: |
dev/release/verify-release-candidate-wheels.bat %VERSION% %RC%
@@ -176,7 +196,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Run
run: |
- package_id=${GITHUB_REF_NAME%-rc*}
+ package_id=${RC_TAG%-rc*}
version=${package_id#apache-arrow-}
- rc=${GITHUB_REF_NAME#*-rc}
+ rc=${RC_TAG#*-rc}
dev/release/verify-release-candidate.sh ${version} ${rc}
diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py
index 3bc0012116..01f6588c6d 100755
--- a/dev/release/download_rc_binaries.py
+++ b/dev/release/download_rc_binaries.py
@@ -28,7 +28,6 @@ import subprocess
import time
import urllib.request
-
DEFAULT_PARALLEL_DOWNLOADS = 8
@@ -36,33 +35,31 @@ class Downloader:
def get_file_list(self, prefix, filter=None):
def traverse(directory, files, directories):
- url = f'{self.URL_ROOT}/{directory}'
+ url = f"{self.URL_ROOT}/{directory}"
response = urllib.request.urlopen(url).read().decode()
paths = re.findall('<a href="(.+?)"', response)
for path in paths:
- path = re.sub(f'^{re.escape(url)}',
- '',
- path)
- if path == '../':
+ path = re.sub(f"^{re.escape(url)}", "", path)
+ if path == "../":
continue
- resolved_path = f'{directory}{path}'
+ resolved_path = f"{directory}{path}"
if filter and not filter(path):
continue
- if path.endswith('/'):
+ if path.endswith("/"):
directories.append(resolved_path)
else:
files.append(resolved_path)
+
files = []
- if prefix != '' and not prefix.endswith('/'):
- prefix += '/'
+ if prefix != "" and not prefix.endswith("/"):
+ prefix += "/"
directories = [prefix]
while len(directories) > 0:
directory = directories.pop()
traverse(directory, files, directories)
return files
- def download_files(self, files, dest=None, num_parallel=None,
- re_match=None):
+ def download_files(self, files, dest=None, num_parallel=None, re_match=None):
"""
Download files from Bintray in parallel. If file already exists, will
overwrite if the checksum does not match what Bintray says it should be
@@ -83,19 +80,21 @@ class Downloader:
num_parallel = DEFAULT_PARALLEL_DOWNLOADS
if re_match is not None:
- regex = re.compile(re_match)
- files = [x for x in files if regex.match(x)]
+ files = self._filter_files(files, re_match)
if num_parallel == 1:
for path in files:
self._download_file(dest, path)
else:
parallel_map_terminate_early(
- functools.partial(self._download_file, dest),
- files,
- num_parallel
+ functools.partial(self._download_file, dest), files, num_parallel
)
+ def _filter_files(self, files, re_match):
+ regex = re.compile(re_match)
+ return [x for x in files if regex.match(x)]
+
def _download_file(self, dest, path):
base, filename = os.path.split(path)
@@ -106,7 +105,7 @@ class Downloader:
print("Downloading {} to {}".format(path, dest_path))
- url = f'{self.URL_ROOT}/{path}'
+ url = f"{self.URL_ROOT}/{path}"
self._download_url(url, dest_path)
def _download_url(self, url, dest_path, *, extra_args=None):
@@ -128,8 +127,8 @@ class Downloader:
delay = attempt * 3
print(f"Waiting {delay} seconds before retrying {url}")
time.sleep(delay)
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
+ proc = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode != 0:
try:
@@ -142,8 +141,9 @@ class Downloader:
break
else:
return
- raise Exception(f"Downloading {url} failed\n"
- f"stdout: {stdout}\nstderr: {stderr}")
+ raise Exception(
+ f"Downloading {url} failed\n" f"stdout: {stdout}\nstderr: {stderr}"
+ )
def _curl_version(self):
cmd = ["curl", "--version"]
@@ -157,8 +157,10 @@ class Artifactory(Downloader):
class Maven(Downloader):
- URL_ROOT = "https://repository.apache.org" + \
- "/content/repositories/staging/org/apache/arrow"
+ URL_ROOT = (
+ "https://repository.apache.org"
+ + "/content/repositories/staging/org/apache/arrow"
+ )
class GitHub(Downloader):
@@ -174,8 +176,10 @@ class GitHub(Downloader):
self._token = os.environ.get("GH_TOKEN")
def get_file_list(self, prefix, filter=None):
- url = (f"https://api.github.com/repos/{self._repository}/"
- f"releases/tags/{self._tag}")
+ url = (
+ f"https://api.github.com/repos/{self._repository}/"
+ f"releases/tags/{self._tag}"
+ )
print("Fetching release from", url)
headers = {
"Accept": "application/vnd.github+json",
@@ -204,6 +208,10 @@ class GitHub(Downloader):
files.append((asset["name"], url))
return files
+ def _filter_files(self, files, re_match):
+ regex = re.compile(re_match)
+ return [x for x in files if regex.match(x[0])]
+
def _download_file(self, dest, asset):
name, url = asset
@@ -226,11 +234,7 @@ class GitHub(Downloader):
if self._curl_version() >= (7, 71, 0):
# Also retry 403s
extra_args.append("--retry-all-errors")
- self._download_url(
- url,
- dest_path,
- extra_args=extra_args
- )
+ self._download_url(url, dest_path, extra_args=extra_args)
def parallel_map_terminate_early(f, iterable, num_parallel):
@@ -248,38 +252,45 @@ def parallel_map_terminate_early(f, iterable, num_parallel):
ARROW_REPOSITORY_PACKAGE_TYPES = [
- 'almalinux',
- 'amazon-linux',
- 'centos',
- 'debian',
- 'ubuntu',
+ "almalinux",
+ "amazon-linux",
+ "centos",
+ "debian",
+ "ubuntu",
]
-ARROW_STANDALONE_PACKAGE_TYPES = ['nuget', 'python']
-ARROW_PACKAGE_TYPES = \
- ARROW_REPOSITORY_PACKAGE_TYPES + \
- ARROW_STANDALONE_PACKAGE_TYPES
-
-
-def download_rc_binaries(version, rc_number, re_match=None, dest=None,
- num_parallel=None, target_package_type=None,
- repository=None, tag=None):
- version_string = '{}-rc{}'.format(version, rc_number)
- version_pattern = re.compile(r'\d+\.\d+\.\d+')
+ARROW_STANDALONE_PACKAGE_TYPES = ["nuget", "python"]
+ARROW_PACKAGE_TYPES = ARROW_REPOSITORY_PACKAGE_TYPES + ARROW_STANDALONE_PACKAGE_TYPES
+
+
+def download_rc_binaries(
+ version,
+ rc_number,
+ re_match=None,
+ dest=None,
+ num_parallel=None,
+ target_package_type=None,
+ repository=None,
+ tag=None,
+):
+ version_string = "{}-rc{}".format(version, rc_number)
+ version_pattern = re.compile(r"\d+\.\d+\.\d+")
if target_package_type:
package_types = [target_package_type]
else:
package_types = ARROW_PACKAGE_TYPES
for package_type in package_types:
+
def is_target(path):
match = version_pattern.search(path)
if not match:
return True
return match[0] == version
+
filter = is_target
- if package_type == 'github' or package_type == 'nuget':
+ if package_type == "github" or package_type in ARROW_STANDALONE_PACKAGE_TYPES:
downloader = GitHub(repository, tag)
- prefix = ''
+ prefix = ""
filter = None
elif package_type in ARROW_REPOSITORY_PACKAGE_TYPES:
downloader = Artifactory()
@@ -289,33 +300,56 @@ def download_rc_binaries(version, rc_number, re_match=None, dest=None,
prefix = f'{package_type}-rc/{version_string}'
filter = None
files = downloader.get_file_list(prefix, filter=filter)
- downloader.download_files(files, re_match=re_match, dest=dest,
- num_parallel=num_parallel)
+ downloader.download_files(
+ files, re_match=re_match, dest=dest, num_parallel=num_parallel
+ )
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser(
- description='Download release candidate binaries'
+ description="Download release candidate binaries")
+ parser.add_argument("version", type=str, help="The version number")
+ parser.add_argument(
+ "rc_number", type=int, help="The release candidate number, e.g. 0, 1, etc"
+ )
+ parser.add_argument(
+ "-e",
+ "--regexp",
+ type=str,
+ default=None,
+ help=(
+ "Regular expression to match on file names "
+ "to only download certain files"
+ ),
+ )
+ parser.add_argument(
+ "--dest",
+ type=str,
+ default=os.getcwd(),
+ help="The output folder for the downloaded files",
+ )
+ parser.add_argument(
+ "--num_parallel",
+ type=int,
+ default=DEFAULT_PARALLEL_DOWNLOADS,
+ help="The number of concurrent downloads to do",
+ )
+ parser.add_argument(
+ "--package_type",
+ type=str,
+ default=None,
+ help="The package type to be downloaded",
+ )
+ parser.add_argument(
+ "--repository",
+ type=str,
+ help=("The repository to pull from " "(required if --package_type=github)"),
+ )
+ parser.add_argument(
+ "--tag",
+ type=str,
+ help=("The release tag to download " "(required if --package_type=github)"),
)
- parser.add_argument('version', type=str, help='The version number')
- parser.add_argument('rc_number', type=int,
- help='The release candidate number, e.g. 0, 1, etc')
- parser.add_argument('-e', '--regexp', type=str, default=None,
- help=('Regular expression to match on file names '
- 'to only download certain files'))
- parser.add_argument('--dest', type=str, default=os.getcwd(),
- help='The output folder for the downloaded files')
- parser.add_argument('--num_parallel', type=int,
- default=DEFAULT_PARALLEL_DOWNLOADS,
- help='The number of concurrent downloads to do')
- parser.add_argument('--package_type', type=str, default=None,
- help='The package type to be downloaded')
- parser.add_argument('--repository', type=str,
- help=('The repository to pull from '
- '(required if --package_type=github)'))
- parser.add_argument('--tag', type=str,
- help=('The release tag to download '
- '(required if --package_type=github)'))
args = parser.parse_args()
download_rc_binaries(
diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat
index a9a4703fae..e41d2dbf25 100644
--- a/dev/release/verify-release-candidate-wheels.bat
+++ b/dev/release/verify-release-candidate-wheels.bat
@@ -33,29 +33,18 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR%
cd %_VERIFICATION_DIR%
-@rem clone Arrow repository to obtain test requirements
-set GIT_ENV_PATH=%_VERIFICATION_DIR%\_git
-call conda create -p %GIT_ENV_PATH% ^
- --no-shortcuts -f -q -y git ^
- || EXIT /B 1
-call activate %GIT_ENV_PATH%
-
-git clone https://github.com/apache/arrow.git || EXIT /B 1
-pushd arrow
-git submodule update --init
-popd
-
set ARROW_VERSION=%1
set RC_NUMBER=%2
-python arrow\dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^
- --package_type python ^
+python dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^
+ --package_type="python" ^
+ --repository="apache/arrow" ^
+ --dest="%_VERIFICATION_DIR%" ^
+ --tag="apache-arrow-%ARROW_VERSION%-rc%RC_NUMBER%" ^
--regex=".*win_amd64.*" || EXIT /B 1
-call deactivate
-
-set ARROW_TEST_DATA=%cd%\arrow\testing\data
-set PARQUET_TEST_DATA=%cd%\arrow\cpp\submodules\parquet-testing\data
+set ARROW_TEST_DATA=%cd%\testing\data
+set PARQUET_TEST_DATA=%cd%\cpp\submodules\parquet-testing\data
CALL :verify_wheel 3.9
@@ -99,13 +88,13 @@ call activate %CONDA_ENV_PATH%
set WHEEL_FILENAME=pyarrow-%ARROW_VERSION%-cp%PY_VERSION_NO_PERIOD%-cp%PY_VERSION_NO_PERIOD%%ABI_TAG%-win_amd64.whl
-pip install python-rc\%ARROW_VERSION%-rc%RC_NUMBER%\%WHEEL_FILENAME% || EXIT /B 1
+pip install %_VERIFICATION_DIR%\%WHEEL_FILENAME% || EXIT /B 1
python -c "import pyarrow" || EXIT /B 1
python -c "import pyarrow.parquet" || EXIT /B 1
python -c "import pyarrow.flight" || EXIT /B 1
python -c "import pyarrow.dataset" || EXIT /B 1
-pip install -r arrow\python\requirements-test.txt || EXIT /B 1
+pip install -r %_CURRENT_DIR%\python\requirements-test.txt || EXIT /B 1
set PYARROW_TEST_CYTHON=OFF
set TZDIR=%CONDA_ENV_PATH%\share\zoneinfo
@@ -113,6 +102,6 @@ pytest %CONDA_ENV_PATH%\Lib\site-packages\pyarrow --pdb -v || EXIT /B 1
:done
-call deactivate
+call conda deactivate
EXIT /B 0
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index d7ffcdb0af..21afb90d93 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -168,6 +168,7 @@ verify_dir_artifact_signatures() {
}
test_binary() {
+ # this downloads all artifacts and verifies their checksums and signatures
show_header "Testing binary artifacts"
maybe_setup_conda
@@ -176,7 +177,8 @@ test_binary() {
${PYTHON:-python3} $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \
--dest=${download_dir} \
- --repository=${GITHUB_REPOSITORY:-apache/arrow}
+ --repository=${GITHUB_REPOSITORY:-apache/arrow} \
+ --tag="apache-arrow-$VERSION-rc$RC_NUMBER"
verify_dir_artifact_signatures ${download_dir}
}
@@ -1049,11 +1051,13 @@ test_wheels() {
$SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \
--package_type python \
--regex=${filter_regex} \
- --dest=${download_dir}
+ --dest=${download_dir} \
+ --repository=${GITHUB_REPOSITORY:-apache/arrow} \
+ --tag="apache-arrow-$VERSION-rc$RC_NUMBER"
verify_dir_artifact_signatures ${download_dir}
- wheels_dir=${download_dir}/python-rc/${VERSION}-rc${RC_NUMBER}
+ wheels_dir=${download_dir}
fi
pushd ${wheels_dir}