jenkins-bot has submitted this change. ( 
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/511334 )

Change subject: [IMPR] Weblinkchecker: throttle connections to the same host
......................................................................

[IMPR] Weblinkchecker: throttle connections to the same host

Bug: T152350
Change-Id: I894582d115013f5bf09e42bff6023c25bee6f02b
---
M scripts/weblinkchecker.py
1 file changed, 27 insertions(+), 2 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 4c49ee9..7730707 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -114,6 +114,7 @@
 import re
 import threading
 import time
+import urllib.parse as urlparse
 from contextlib import suppress
 from functools import partial
 from http import HTTPStatus
@@ -122,6 +123,7 @@

 import pywikibot
 from pywikibot import comms, config, i18n, pagegenerators, textlib
+from pywikibot.backports import Dict, removeprefix
 from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
 from pywikibot.exceptions import (
     IsRedirectPageError,
@@ -289,6 +291,10 @@
     After checking the page, it will die.
     """

+    #: Collecting start time of a thread for any host
+    hosts = {}  # type: Dict[str, float]
+    lock = threading.Lock()
+
     def __init__(self, page, url, history, http_ignores, day) -> None:
         """Initializer."""
         self.page = page
@@ -307,12 +313,28 @@
         self._use_fake_user_agent = config.fake_user_agent_default.get(
             'weblinkchecker', False)
         self.day = day
+        super().__init__()

-        name = '{} - {}'.format(page.title(), url.encode('utf-8', 'replace'))
-        super().__init__(name=name)
+    @classmethod
+    def get_delay(cls, name: str) -> float:
+        """Determine delay from class attribute.
+
+        Store the last call for a given hostname with an offset of
+        6 seconds to ensure there are no more than 10 calls per minute
+        for the same host. Calculate the delay to start the run.
+
+        :param name: The key for the hosts class attribute
+        :return: The calculated delay to start the run
+        """
+        now = time.monotonic()
+        with cls.lock:
+            timestamp = cls.hosts.get(name, now)
+            cls.hosts[name] = max(now, timestamp) + 6
+        return max(0, timestamp - now)

     def run(self):
         """Run the bot."""
+        time.sleep(self.get_delay(self.name))
         try:
             header = self.header
             r = comms.http.fetch(
@@ -599,6 +621,9 @@
                                          self.http_ignores, self.day)
                 # thread dies when program terminates
                 thread.daemon = True
+                # use hostname as thread.name
+                thread.name = removeprefix(
+                    urlparse.urlparse(url).hostname, 'www.')
                 self.threads.append(thread)
 


--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/511334
To unsubscribe, or for help writing mail filters, visit 
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I894582d115013f5bf09e42bff6023c25bee6f02b
Gerrit-Change-Number: 511334
Gerrit-PatchSet: 6
Gerrit-Owner: Xqt <[email protected]>
Gerrit-Reviewer: D3r1ck01 <[email protected]>
Gerrit-Reviewer: Dalba <[email protected]>
Gerrit-Reviewer: Dvorapa <[email protected]>
Gerrit-Reviewer: Huji <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Matěj Suchánek <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Mpaa <[email protected]>
Gerrit-Reviewer: Xqt <[email protected]>
Gerrit-Reviewer: Zhuyifei1999 <[email protected]>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Rubin <[email protected]>
Gerrit-MessageType: merged
_______________________________________________
Pywikibot-commits mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to