This is an automated email from the ASF dual-hosted git repository.
sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-release.git
The following commit(s) were added to refs/heads/main by this push:
new a9d2a1e Fetch URLs in parallel in the keys import script
a9d2a1e is described below
commit a9d2a1eed6d0e4a5a93f73e01aa31c0d131da831
Author: Sean B. Palmer <[email protected]>
AuthorDate: Tue Jun 17 14:09:29 2025 +0100
Fetch URLs in parallel in the keys import script
---
scripts/keys_import.py | 100 ++++++++++++++++++++++++++++++++++---------------
1 file changed, 69 insertions(+), 31 deletions(-)
diff --git a/scripts/keys_import.py b/scripts/keys_import.py
index 3a4d4da..a86ed74 100644
--- a/scripts/keys_import.py
+++ b/scripts/keys_import.py
@@ -3,53 +3,91 @@
import asyncio
import sys
-
-import httpx
-import pyinstrument
+import time
sys.path.append(".")
+
+import atr.config as config
import atr.db as db
import atr.db.interaction as interaction
+import atr.ldap as ldap
+import atr.util as util
+
+
+def get(entry, prop):
+ if prop in entry:
+ values = entry[prop]
+ if values:
+ return values[0]
+ return None
async def amain():
- # This runs in serial, and takes several minutes
- # We add about 5 keys per second, and there are around 2500 keys
- # Therefore we expect it to take about 500 seconds, which is just over 8 minutes
- profiler = pyinstrument.Profiler()
- profiler = None
- if profiler is not None:
- profiler.start()
+ # Runs as a standalone script, so we need a worker style database connection
await db.init_database_for_worker()
+
+ # Get all email addresses in LDAP
+ # We'll discard them when we're finished
+ conf = config.AppConfig()
+ bind_dn = conf.LDAP_BIND_DN
+ bind_password = conf.LDAP_BIND_PASSWORD
+ ldap_params = ldap.SearchParameters(
+ uid_query="*",
+ bind_dn_from_config=bind_dn,
+ bind_password_from_config=bind_password,
+ email_only=True,
+ )
+ start = time.perf_counter_ns()
+ await asyncio.to_thread(ldap.search, ldap_params)
+ end = time.perf_counter_ns()
+ print("LDAP search took", (end - start) / 1000000, "ms")
+
+ # Map the LDAP addresses to Apache UIDs
+ email_to_uid = {}
+ for entry in ldap_params.results_list:
+ uid = entry.get("uid", [""])[0]
+ if mail := get(entry, "mail"):
+ email_to_uid[mail] = uid
+ if alt_email := get(entry, "asf-altEmail"):
+ email_to_uid[alt_email] = uid
+ if committer_email := get(entry, "asf-committer-email"):
+ email_to_uid[committer_email] = uid
+ print("Email addresses from LDAP:", len(email_to_uid))
+
+ # Open an ATR database connection
async with db.session() as data:
+ # Get the KEYS file of each committee
committees = await data.committee().all()
committees = list(committees)
committees.sort(key=lambda c: c.name.lower())
- limit = 10
- for i, committee in enumerate(committees):
- if (profiler is not None) and (i >= limit):
- break
- async with httpx.AsyncClient() as client:
- response = await client.get(f"https://downloads.apache.org/{committee.name}/KEYS")
- try:
- response.raise_for_status()
- except httpx.HTTPStatusError:
- print(committee.name + ": no KEYS file")
- continue
- keys_data = await response.aread()
- keys_text = keys_data.decode("utf-8", errors="replace")
+ urls = [f"https://downloads.apache.org/{committee.name}/KEYS" for committee in committees]
+ total_yes = 0
+ total_no = 0
+ async for url, status, content in util.get_urls_as_completed(urls):
+ # For each remote KEYS file, check that it responded 200 OK
+ committee_name = url.rsplit("/", 2)[-2]
+ if status != 200:
+ print(committee_name, "error:", status)
+ continue
+
+ # Parse the KEYS file and add it to the database
+ # TODO: We could have this return the keys to make it more efficient
+ # Then we could use the bulk upsert query method
try:
- _result, yes, no, _committees = await interaction.upload_keys(
- [committee.name], keys_text, [committee.name]
+ _result, yes, no, _committees = await interaction.upload_keys_bytes(
+ [committee_name], content, [committee_name], ldap_data=email_to_uid
)
- except interaction.InteractionError as e:
- print(committee.name + ":", e)
+ except Exception as e:
+ print(committee_name, "error:", e)
continue
- print(f"{committee.name}: {yes} successes, {no} failures")
- if profiler is not None:
- profiler.stop()
- print(profiler.output_text(show_all=True, color=True))
+
+ # Print and record the number of keys that were okay and failed
+ print(committee_name, yes, no)
+ total_yes += yes
+ total_no += no
+ print("Total okay:", total_yes)
+ print("Total failed:", total_no)
def main():
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]