This is an automated email from the ASF dual-hosted git repository.

sbp pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tooling-trusted-release.git


The following commit(s) were added to refs/heads/main by this push:
     new a9d2a1e  Fetch URLs in parallel in the keys import script
a9d2a1e is described below

commit a9d2a1eed6d0e4a5a93f73e01aa31c0d131da831
Author: Sean B. Palmer <[email protected]>
AuthorDate: Tue Jun 17 14:09:29 2025 +0100

    Fetch URLs in parallel in the keys import script
---
 scripts/keys_import.py | 100 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 31 deletions(-)

diff --git a/scripts/keys_import.py b/scripts/keys_import.py
index 3a4d4da..a86ed74 100644
--- a/scripts/keys_import.py
+++ b/scripts/keys_import.py
@@ -3,53 +3,91 @@
 
 import asyncio
 import sys
-
-import httpx
-import pyinstrument
+import time
 
 sys.path.append(".")
 
+
+import atr.config as config
 import atr.db as db
 import atr.db.interaction as interaction
+import atr.ldap as ldap
+import atr.util as util
+
+
+def get(entry, prop):
+    if prop in entry:
+        values = entry[prop]
+        if values:
+            return values[0]
+    return None
 
 
 async def amain():
-    # This runs in serial, and takes several minutes
-    # We add about 5 keys per second, and there are around 2500 keys
-    # Therefore we expect it to take about 500 seconds, which is just over 8 minutes
-    profiler = pyinstrument.Profiler()
-    profiler = None
-    if profiler is not None:
-        profiler.start()
+    # Runs as a standalone script, so we need a worker style database connection
     await db.init_database_for_worker()
+
+    # Get all email addresses in LDAP
+    # We'll discard them when we're finished
+    conf = config.AppConfig()
+    bind_dn = conf.LDAP_BIND_DN
+    bind_password = conf.LDAP_BIND_PASSWORD
+    ldap_params = ldap.SearchParameters(
+        uid_query="*",
+        bind_dn_from_config=bind_dn,
+        bind_password_from_config=bind_password,
+        email_only=True,
+    )
+    start = time.perf_counter_ns()
+    await asyncio.to_thread(ldap.search, ldap_params)
+    end = time.perf_counter_ns()
+    print("LDAP search took", (end - start) / 1000000, "ms")
+
+    # Map the LDAP addresses to Apache UIDs
+    email_to_uid = {}
+    for entry in ldap_params.results_list:
+        uid = entry.get("uid", [""])[0]
+        if mail := get(entry, "mail"):
+            email_to_uid[mail] = uid
+        if alt_email := get(entry, "asf-altEmail"):
+            email_to_uid[alt_email] = uid
+        if committer_email := get(entry, "asf-committer-email"):
+            email_to_uid[committer_email] = uid
+    print("Email addresses from LDAP:", len(email_to_uid))
+
+    # Open an ATR database connection
     async with db.session() as data:
+        # Get the KEYS file of each committee
         committees = await data.committee().all()
         committees = list(committees)
         committees.sort(key=lambda c: c.name.lower())
-        limit = 10
-        for i, committee in enumerate(committees):
-            if (profiler is not None) and (i >= limit):
-                break
-            async with httpx.AsyncClient() as client:
-                response = await client.get(f"https://downloads.apache.org/{committee.name}/KEYS")
-                try:
-                    response.raise_for_status()
-                except httpx.HTTPStatusError:
-                    print(committee.name + ": no KEYS file")
-                    continue
-                keys_data = await response.aread()
-            keys_text = keys_data.decode("utf-8", errors="replace")
+        urls = [f"https://downloads.apache.org/{committee.name}/KEYS" for committee in committees]
+        total_yes = 0
+        total_no = 0
+        async for url, status, content in util.get_urls_as_completed(urls):
+            # For each remote KEYS file, check that it responded 200 OK
+            committee_name = url.rsplit("/", 2)[-2]
+            if status != 200:
+                print(committee_name, "error:", status)
+                continue
+
+            # Parse the KEYS file and add it to the database
+            # TODO: We could have this return the keys to make it more efficient
+            # Then we could use the bulk upsert query method
             try:
-                _result, yes, no, _committees = await interaction.upload_keys(
-                    [committee.name], keys_text, [committee.name]
+                _result, yes, no, _committees = await interaction.upload_keys_bytes(
+                    [committee_name], content, [committee_name], ldap_data=email_to_uid
                 )
-            except interaction.InteractionError as e:
-                print(committee.name + ":", e)
+            except Exception as e:
+                print(committee_name, "error:", e)
                 continue
-            print(f"{committee.name}: {yes} successes, {no} failures")
-    if profiler is not None:
-        profiler.stop()
-        print(profiler.output_text(show_all=True, color=True))
+
+            # Print and record the number of keys that were okay and failed
+            print(committee_name, yes, no)
+            total_yes += yes
+            total_no += no
+        print("Total okay:", total_yes)
+        print("Total failed:", total_no)
 
 
 def main():


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to