This is an automated email from the ASF dual-hosted git repository. assignuser pushed a commit to branch release-20.0.0-rc1 in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 1e2aa39012961e85a719857f0089fa04ab9119b3 Author: Jacob Wujciak-Jens <[email protected]> AuthorDate: Tue Apr 15 17:27:09 2025 +0200 GH-46075: [Release][CI] Fix binary verification (#46076) ### Rationale for this change ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? **This PR includes breaking changes to public APIs.** (If there are any breaking changes to public APIs, please explain which changes are breaking. If not, you can remove this.) **This PR contains a "Critical Fix".** (If the changes fix either (a) a security vulnerability, (b) a bug that caused incorrect or invalid data to be produced, or (c) a bug that causes a crash (even when the API contract is upheld), please provide explanation. If not, you can remove this.) * GitHub Issue: #46075 Authored-by: Jacob Wujciak-Jens <[email protected]> Signed-off-by: Jacob Wujciak-Jens <[email protected]> --- .github/workflows/verify_rc.yml | 44 ++++-- dev/release/download_rc_binaries.py | 178 ++++++++++++++---------- dev/release/verify-release-candidate-wheels.bat | 31 ++--- dev/release/verify-release-candidate.sh | 10 +- 4 files changed, 155 insertions(+), 108 deletions(-) diff --git a/.github/workflows/verify_rc.yml b/.github/workflows/verify_rc.yml index fe46ae6f23..dceb04a492 100644 --- a/.github/workflows/verify_rc.yml +++ b/.github/workflows/verify_rc.yml @@ -21,6 +21,16 @@ on: push: tags: - "*-rc*" + pull_request: + paths: + - ".github/workflows/verify_rc.yml" + workflow_dispatch: + inputs: + rc_tag: + description: "Tag of the rc to verify" + type: string + required: true + permissions: contents: read @@ -28,6 +38,7 @@ permissions: env: TEST_DEFAULT: "0" VERBOSE: "1" + RC_TAG: "${{ inputs.rc_tag || github.event_name == 'pull_request' && 'apache-arrow-20.0.0-rc0' || github.ref_name }}" jobs: apt: @@ -46,9 +57,9 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} version=${package_id#apache-arrow-} - rc=${GITHUB_REF_NAME#*-rc} + rc=${RC_TAG#*-rc} dev/release/verify-release-candidate.sh ${version} ${rc} binary: @@ -61,9 +72,9 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} version=${package_id#apache-arrow-} - rc=${GITHUB_REF_NAME#*-rc} + rc=${RC_TAG#*-rc} dev/release/verify-release-candidate.sh ${version} ${rc} wheels-linux: @@ -89,9 +100,9 @@ jobs: run: python -m pip install -e dev/archery[docker] - name: Prepare run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV} - echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV} + echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV} distro=${{ matrix.distro }} if [ "${distro}" = "conda" ]; then echo "SERVICE=${distro}-verify-rc" >> ${GITHUB_ENV} @@ -102,6 +113,8 @@ jobs: echo "$(echo ${os} | tr a-z A-Z)=${version}" >> ${GITHUB_ENV} fi - name: Run + env: + GH_TOKEN: ${{ github.token }} run: | archery docker run \ -e TEST_DEFAULT="${TEST_DEFAULT}" \ @@ -109,6 +122,7 @@ jobs: -e VERBOSE="${VERBOSE}" \ -e VERIFY_RC="${RC}" \ -e VERIFY_VERSION="${VERSION}" \ + -e GH_TOKEN="$GH_TOKEN" \ ${SERVICE} wheels-macos: @@ -126,10 +140,12 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run + env: + GH_TOKEN: ${{ github.token }} run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} version=${package_id#apache-arrow-} - rc=${GITHUB_REF_NAME#*-rc} + rc=${RC_TAG#*-rc} dev/release/verify-release-candidate.sh ${version} ${rc} wheels-windows: @@ -141,12 +157,14 @@ jobs: TEST_WHEELS: "1" steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive - name: Prepare shell: bash run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} echo "VERSION=${package_id#apache-arrow-}" >> ${GITHUB_ENV} - echo "RC=${GITHUB_REF_NAME#*-rc}" >> ${GITHUB_ENV} + echo "RC=${RC_TAG#*-rc}" >> ${GITHUB_ENV} - uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3.1.1 - name: Install System Dependencies run: | @@ -156,6 +174,8 @@ jobs: shell: bash run: ci/scripts/download_tz_database.sh - name: Run verification + env: + GH_TOKEN: ${{ github.token }} shell: cmd run: | dev/release/verify-release-candidate-wheels.bat %VERSION% %RC% @@ -176,7 +196,7 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Run run: | - package_id=${GITHUB_REF_NAME%-rc*} + package_id=${RC_TAG%-rc*} version=${package_id#apache-arrow-} - rc=${GITHUB_REF_NAME#*-rc} + rc=${RC_TAG#*-rc} dev/release/verify-release-candidate.sh ${version} ${rc} diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py index 3bc0012116..01f6588c6d 100755 --- a/dev/release/download_rc_binaries.py +++ b/dev/release/download_rc_binaries.py @@ -28,7 +28,6 @@ import subprocess import time import urllib.request - DEFAULT_PARALLEL_DOWNLOADS = 8 @@ -36,33 +35,31 @@ class Downloader: def get_file_list(self, prefix, filter=None): def traverse(directory, files, directories): - url = f'{self.URL_ROOT}/{directory}' + url = f"{self.URL_ROOT}/{directory}" response = urllib.request.urlopen(url).read().decode() paths = re.findall('<a href="(.+?)"', response) for path in paths: - path = re.sub(f'^{re.escape(url)}', - '', - path) - if path == '../': + path = re.sub(f"^{re.escape(url)}", "", path) + if path == "../": continue - resolved_path = f'{directory}{path}' + resolved_path = f"{directory}{path}" if filter and not filter(path): continue - if path.endswith('/'): + if path.endswith("/"): directories.append(resolved_path) else: files.append(resolved_path) + files = [] - if prefix != '' and not prefix.endswith('/'): - prefix += '/' + if prefix != "" and not prefix.endswith("/"): + prefix += "/" directories = [prefix] while len(directories) > 0: directory = directories.pop() traverse(directory, files, directories) return files - def download_files(self, files, dest=None, num_parallel=None, - re_match=None): + def download_files(self, files, dest=None, num_parallel=None, re_match=None): """ Download files from Bintray in parallel. If file already exists, will overwrite if the checksum does not match what Bintray says it should be @@ -83,19 +80,21 @@ class Downloader: num_parallel = DEFAULT_PARALLEL_DOWNLOADS if re_match is not None: - regex = re.compile(re_match) - files = [x for x in files if regex.match(x)] + files = self._filter_files(files, re_match) if num_parallel == 1: for path in files: self._download_file(dest, path) else: parallel_map_terminate_early( - functools.partial(self._download_file, dest), - files, - num_parallel + functools.partial(self._download_file, + dest), files, num_parallel ) + def _filter_files(self, files, re_match): + regex = re.compile(re_match) + return [x for x in files if regex.match(x)] + def _download_file(self, dest, path): base, filename = os.path.split(path) @@ -106,7 +105,7 @@ class Downloader: print("Downloading {} to {}".format(path, dest_path)) - url = f'{self.URL_ROOT}/{path}' + url = f"{self.URL_ROOT}/{path}" self._download_url(url, dest_path) def _download_url(self, url, dest_path, *, extra_args=None): @@ -128,8 +127,8 @@ class Downloader: delay = attempt * 3 print(f"Waiting {delay} seconds before retrying {url}") time.sleep(delay) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = proc.communicate() if proc.returncode != 0: try: @@ -142,8 +141,9 @@ class Downloader: break else: return - raise Exception(f"Downloading {url} failed\n" - f"stdout: {stdout}\nstderr: {stderr}") + raise Exception( + f"Downloading {url} failed\n" f"stdout: {stdout}\nstderr: {stderr}" + ) def _curl_version(self): cmd = ["curl", "--version"] @@ -157,8 +157,10 @@ class Artifactory(Downloader): class Maven(Downloader): - URL_ROOT = "https://repository.apache.org" + \ - "/content/repositories/staging/org/apache/arrow" + URL_ROOT = ( + "https://repository.apache.org" + + "/content/repositories/staging/org/apache/arrow" + ) class GitHub(Downloader): @@ -174,8 +176,10 @@ class GitHub(Downloader): self._token = os.environ.get("GH_TOKEN") def get_file_list(self, prefix, filter=None): - url = (f"https://api.github.com/repos/{self._repository}/" - f"releases/tags/{self._tag}") + url = ( + f"https://api.github.com/repos/{self._repository}/" + f"releases/tags/{self._tag}" + ) print("Fetching release from", url) headers = { "Accept": "application/vnd.github+json", @@ -204,6 +208,10 @@ class GitHub(Downloader): files.append((asset["name"], url)) return files + def _filter_files(self, files, re_match): + regex = re.compile(re_match) + return [x for x in files if regex.match(x[0])] + def _download_file(self, dest, asset): name, url = asset @@ -226,11 +234,7 @@ class GitHub(Downloader): if self._curl_version() >= (7, 71, 0): # Also retry 403s extra_args.append("--retry-all-errors") - self._download_url( - url, - dest_path, - extra_args=extra_args - ) + self._download_url(url, dest_path, extra_args=extra_args) def parallel_map_terminate_early(f, iterable, num_parallel): @@ -248,38 +252,45 @@ def parallel_map_terminate_early(f, iterable, num_parallel): ARROW_REPOSITORY_PACKAGE_TYPES = [ - 'almalinux', - 'amazon-linux', - 'centos', - 'debian', - 'ubuntu', + "almalinux", + "amazon-linux", + "centos", + "debian", + "ubuntu", ] -ARROW_STANDALONE_PACKAGE_TYPES = ['nuget', 'python'] -ARROW_PACKAGE_TYPES = \ - ARROW_REPOSITORY_PACKAGE_TYPES + \ - ARROW_STANDALONE_PACKAGE_TYPES - - -def download_rc_binaries(version, rc_number, re_match=None, dest=None, - num_parallel=None, target_package_type=None, - repository=None, tag=None): - version_string = '{}-rc{}'.format(version, rc_number) - version_pattern = re.compile(r'\d+\.\d+\.\d+') +ARROW_STANDALONE_PACKAGE_TYPES = ["nuget", "python"] +ARROW_PACKAGE_TYPES = ARROW_REPOSITORY_PACKAGE_TYPES + ARROW_STANDALONE_PACKAGE_TYPES + + +def download_rc_binaries( + version, + rc_number, + re_match=None, + dest=None, + num_parallel=None, + target_package_type=None, + repository=None, + tag=None, +): + version_string = "{}-rc{}".format(version, rc_number) + version_pattern = re.compile(r"\d+\.\d+\.\d+") if target_package_type: package_types = [target_package_type] else: package_types = ARROW_PACKAGE_TYPES for package_type in package_types: + def is_target(path): match = version_pattern.search(path) if not match: return True return match[0] == version + filter = is_target - if package_type == 'github' or package_type == 'nuget': + if package_type == "github" or package_type in ARROW_STANDALONE_PACKAGE_TYPES: downloader = GitHub(repository, tag) - prefix = '' + prefix = "" filter = None elif package_type in ARROW_REPOSITORY_PACKAGE_TYPES: downloader = Artifactory() @@ -289,33 +300,56 @@ def download_rc_binaries(version, rc_number, re_match=None, dest=None, prefix = f'{package_type}-rc/{version_string}' filter = None files = downloader.get_file_list(prefix, filter=filter) - downloader.download_files(files, re_match=re_match, dest=dest, - num_parallel=num_parallel) + downloader.download_files( + files, re_match=re_match, dest=dest, num_parallel=num_parallel + ) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description='Download release candidate binaries' + description="Download release candidate binaries") + parser.add_argument("version", type=str, help="The version number") + parser.add_argument( + "rc_number", type=int, help="The release candidate number, e.g. 0, 1, etc" + ) + parser.add_argument( + "-e", + "--regexp", + type=str, + default=None, + help=( + "Regular expression to match on file names " + "to only download certain files" + ), + ) + parser.add_argument( + "--dest", + type=str, + default=os.getcwd(), + help="The output folder for the downloaded files", + ) + parser.add_argument( + "--num_parallel", + type=int, + default=DEFAULT_PARALLEL_DOWNLOADS, + help="The number of concurrent downloads to do", + ) + parser.add_argument( + "--package_type", + type=str, + default=None, + help="The package type to be downloaded", + ) + parser.add_argument( + "--repository", + type=str, + help=("The repository to pull from " "(required if --package_type=github)"), + ) + parser.add_argument( + "--tag", + type=str, + help=("The release tag to download " "(required if --package_type=github)"), ) - parser.add_argument('version', type=str, help='The version number') - parser.add_argument('rc_number', type=int, - help='The release candidate number, e.g. 0, 1, etc') - parser.add_argument('-e', '--regexp', type=str, default=None, - help=('Regular expression to match on file names ' - 'to only download certain files')) - parser.add_argument('--dest', type=str, default=os.getcwd(), - help='The output folder for the downloaded files') - parser.add_argument('--num_parallel', type=int, - default=DEFAULT_PARALLEL_DOWNLOADS, - help='The number of concurrent downloads to do') - parser.add_argument('--package_type', type=str, default=None, - help='The package type to be downloaded') - parser.add_argument('--repository', type=str, - help=('The repository to pull from ' - '(required if --package_type=github)')) - parser.add_argument('--tag', type=str, - help=('The release tag to download ' - '(required if --package_type=github)')) args = parser.parse_args() download_rc_binaries( diff --git a/dev/release/verify-release-candidate-wheels.bat b/dev/release/verify-release-candidate-wheels.bat index a9a4703fae..e41d2dbf25 100644 --- a/dev/release/verify-release-candidate-wheels.bat +++ b/dev/release/verify-release-candidate-wheels.bat @@ -33,29 +33,18 @@ if not exist %_VERIFICATION_DIR% mkdir %_VERIFICATION_DIR% cd %_VERIFICATION_DIR% -@rem clone Arrow repository to obtain test requirements -set GIT_ENV_PATH=%_VERIFICATION_DIR%\_git -call conda create -p %GIT_ENV_PATH% ^ - --no-shortcuts -f -q -y git ^ - || EXIT /B 1 -call activate %GIT_ENV_PATH% - -git clone https://github.com/apache/arrow.git || EXIT /B 1 -pushd arrow -git submodule update --init -popd - set ARROW_VERSION=%1 set RC_NUMBER=%2 -python arrow\dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^ - --package_type python ^ +python dev\release\download_rc_binaries.py %ARROW_VERSION% %RC_NUMBER% ^ + --package_type="python" ^ + --repository="apache/arrow" ^ + --dest="%_VERIFICATION_DIR%" ^ + --tag="apache-arrow-%ARROW_VERSION%-rc%RC_NUMBER%" ^ --regex=".*win_amd64.*" || EXIT /B 1 -call deactivate - -set ARROW_TEST_DATA=%cd%\arrow\testing\data -set PARQUET_TEST_DATA=%cd%\arrow\cpp\submodules\parquet-testing\data +set ARROW_TEST_DATA=%cd%\testing\data +set PARQUET_TEST_DATA=%cd%\cpp\submodules\parquet-testing\data CALL :verify_wheel 3.9 @@ -99,13 +88,13 @@ call activate %CONDA_ENV_PATH% set WHEEL_FILENAME=pyarrow-%ARROW_VERSION%-cp%PY_VERSION_NO_PERIOD%-cp%PY_VERSION_NO_PERIOD%%ABI_TAG%-win_amd64.whl -pip install python-rc\%ARROW_VERSION%-rc%RC_NUMBER%\%WHEEL_FILENAME% || EXIT /B 1 +pip install %_VERIFICATION_DIR%\%WHEEL_FILENAME% || EXIT /B 1 python -c "import pyarrow" || EXIT /B 1 python -c "import pyarrow.parquet" || EXIT /B 1 python -c "import pyarrow.flight" || EXIT /B 1 python -c "import pyarrow.dataset" || EXIT /B 1 -pip install -r arrow\python\requirements-test.txt || EXIT /B 1 +pip install -r %_CURRENT_DIR%\python\requirements-test.txt || EXIT /B 1 set PYARROW_TEST_CYTHON=OFF set TZDIR=%CONDA_ENV_PATH%\share\zoneinfo @@ -113,6 +102,6 @@ pytest %CONDA_ENV_PATH%\Lib\site-packages\pyarrow --pdb -v || EXIT /B 1 :done -call deactivate +call conda deactivate EXIT /B 0 diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index d7ffcdb0af..21afb90d93 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -168,6 +168,7 @@ verify_dir_artifact_signatures() { } test_binary() { + # this downloads all artifacts and verifies their checksums and signatures show_header "Testing binary artifacts" maybe_setup_conda @@ -176,7 +177,8 @@ test_binary() { ${PYTHON:-python3} $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ --dest=${download_dir} \ - --repository=${GITHUB_REPOSITORY:-apache/arrow} + --repository=${GITHUB_REPOSITORY:-apache/arrow} \ + --tag="apache-arrow-$VERSION-rc$RC_NUMBER" verify_dir_artifact_signatures ${download_dir} } @@ -1049,11 +1051,13 @@ test_wheels() { $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \ --package_type python \ --regex=${filter_regex} \ - --dest=${download_dir} + --dest=${download_dir} \ + --repository=${GITHUB_REPOSITORY:-apache/arrow} \ + --tag="apache-arrow-$VERSION-rc$RC_NUMBER" verify_dir_artifact_signatures ${download_dir} - wheels_dir=${download_dir}/python-rc/${VERSION}-rc${RC_NUMBER} + wheels_dir=${download_dir} fi pushd ${wheels_dir}
