jenkins-bot has submitted this change and it was merged.

Change subject: Improve fake user agent usage control
......................................................................


Improve fake user agent usage control

comms.http.get_fake_user_agent() is renamed to fake_user_agent() to match the 
style of user_agent(). Logic checking config variable fake_user_agent is 
removed, as it should not be responsible for deciding whether a fake UA should 
be used. Test cases testing the config-checking logic are removed.

The use_fake_user_agent argument is added to comms.http.fetch(), which will 
specify if fake UAs should be used when the method is called to make HTTP 
requests. Test cases testing this logic are added.

The fake_user_agent config variable is deprecated. fake_user_agent_default is 
introduced to set per-script behaviour. fake_user_agent_exceptions is 
introduced to set per-domain behaviours (will be checked by fetch()).

Bug: T152075
Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e
---
M pywikibot/comms/http.py
M pywikibot/config2.py
M scripts/reflinks.py
M scripts/weblinkchecker.py
M tests/http_tests.py
5 files changed, 186 insertions(+), 61 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 908f3f1..7bf6235 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -38,10 +38,11 @@
 
 if sys.version_info[0] > 2:
     from http import cookiejar as cookielib
-    from urllib.parse import quote
+    from urllib.parse import quote, urlparse
 else:
     import cookielib
     from urllib2 import quote
+    from urlparse import urlparse
 
 from pywikibot import config
 
@@ -53,6 +54,7 @@
 )
 from pywikibot.logging import critical, debug, error, log, warning
 from pywikibot.tools import (
+    deprecated,
     deprecate_arg,
     file_mode_checker,
     issue_deprecation_warning,
@@ -234,31 +236,43 @@
     return formatted
 
 
+@deprecated('pywikibot.comms.http.fake_user_agent')
 def get_fake_user_agent():
     """
-    Return a user agent to be used when faking a web browser.
+    Return a fake user agent depending on `fake_user_agent` option in config.
+
+    Deprecated, use fake_user_agent() instead.
 
     @rtype: str
     """
-    # Check fake_user_agent configuration variable
     if isinstance(config.fake_user_agent, StringTypes):
-        return pywikibot.config2.fake_user_agent
+        return config.fake_user_agent
+    elif config.fake_user_agent or config.fake_user_agent is None:
+        return fake_user_agent()
+    else:
+        return user_agent()
 
-    if config.fake_user_agent is None or config.fake_user_agent is True:
-        try:
-            import browseragents
-            return browseragents.core.random()
-        except ImportError:
-            pass
 
-        try:
-            import fake_useragent
-            return fake_useragent.fake.UserAgent().random
-        except ImportError:
-            pass
+def fake_user_agent():
+    """
+    Return a fake user agent.
 
-    # Use the default real user agent
-    return user_agent()
+    @rtype: str
+    """
+    try:
+        import browseragents
+        return browseragents.core.random()
+    except ImportError:
+        pass
+
+    try:
+        import fake_useragent
+        return fake_useragent.fake.UserAgent().random
+    except ImportError:
+        pass
+
+    raise ImportError(  # Actually complain when neither is installed.
+        'Either browseragents or fake_useragent must be installed to get fake 
UAs.')
 
 
 @deprecate_arg('ssl', None)
@@ -443,7 +457,7 @@
 
 
 def fetch(uri, method="GET", body=None, headers=None,
-          default_error_handling=True, **kwargs):
+          default_error_handling=True, use_fake_user_agent=False, **kwargs):
     """
     Blocking HTTP request.
 
@@ -454,8 +468,27 @@
 
     @param default_error_handling: Use default error handling
     @type default_error_handling: bool
+    @type use_fake_user_agent: bool, str
+    @param use_fake_user_agent: Set to True to use fake UA, False to use
+        pywikibot's UA, str to specify own UA. This behaviour might be
+        overridden by domain in config.
     @rtype: L{threadedhttp.HttpRequest}
     """
+    # Change user agent depending on fake UA settings.
+    # Set header to new UA if needed.
+    headers = headers or {}
+    if not headers.get('user-agent', None):  # Skip if already specified in 
request.
+        # Get fake UA exceptions from `fake_user_agent_exceptions` config.
+        uri_domain = urlparse(uri).netloc
+        use_fake_user_agent = config.fake_user_agent_exceptions.get(
+            uri_domain, use_fake_user_agent)
+
+        if use_fake_user_agent and isinstance(
+                use_fake_user_agent, StringTypes):  # Custom UA.
+            headers['user-agent'] = use_fake_user_agent
+        elif use_fake_user_agent is True:
+            headers['user-agent'] = fake_user_agent()
+
     request = _enqueue(uri, method, body, headers, **kwargs)
     assert(request._data is not None)  # if there's no data in the answer 
we're in trouble
     # Run the error handling callback in the callers thread so exceptions
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index 9451eb5..a98aeb7 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -93,7 +93,7 @@
 
 _private_values = ['authenticate', 'proxy', 'db_password']
 _deprecated_variables = ['use_SSL_onlogin', 'use_SSL_always',
-                         'available_ssl_project']
+                         'available_ssl_project', 'fake_user_agent']
 
 # ############# ACCOUNT SETTINGS ##############
 
@@ -137,16 +137,22 @@
 user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) '
                      '{http_backend} {python}')
 
-# Fake user agent
-# Used to retrieve pages in reflinks.py,
-# to work around user-agent sniffing webpages
-# When None or True,
-# Use random user agent if either browseragents or fake_useragent
-# packages are installed
-# Otherwise use pywikibot.comms.http.user_agent()
-# When set to False,
-# disables use of automatic user agents
-fake_user_agent = None
+# Fake user agent.
+# Some external websites reject bot-like user agents. It is possible to use
+# fake user agents in requests to these websites.
+# It is recommended to default this to False and use on an as-needed basis.
+#
+# Default behaviours in modules that can utilize fake UAs.
+# True for enabling fake UA, False for disabling / using pywikibot's own UA, 
str
+# to specify custom UA.
+fake_user_agent_default = {'reflinks': False, 'weblinkchecker': False}
+# Website domains excepted to the default behaviour.
+# True for enabling, False for disabling, str to hardcode a UA.
+# Example: {'problematic.site.example': True,
+#           'prefers.specific.ua.example': 'snakeoil/4.2'}
+fake_user_agent_exceptions = {}
+# This following option is deprecated in favour of finer control options above.
+fake_user_agent = False
 
 # The default interface for communicating with the site
 # currently the only defined interface is 'APISite', so don't change this!
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index c5daf1d..1e930a4 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -59,6 +59,7 @@
 import pywikibot
 
 from pywikibot import comms, i18n, pagegenerators, textlib, Bot
+from pywikibot import config2 as config
 from pywikibot.pagegenerators import (
     XMLDumpPageGenerator as _XMLDumpPageGenerator,
 )
@@ -395,8 +396,7 @@
         super(ReferencesRobot, self).__init__(**kwargs)
         self.generator = generator
         self.site = pywikibot.Site()
-        self._user_agent = comms.http.get_fake_user_agent()
-        pywikibot.log('Using fake user agent: {0}'.format(self._user_agent))
+        self._use_fake_user_agent = 
config.fake_user_agent_default.get('reflinks', False)
         # Check
         manual = 'mw:Manual:Pywikibot/refLinks'
         code = None
@@ -494,7 +494,6 @@
             raise
 
         editedpages = 0
-        headers = {'user-agent': self._user_agent}
         for page in self.generator:
             try:
                 # Load the page's text from the wiki
@@ -526,10 +525,11 @@
                 f = None
 
                 try:
-                    f = requests.get(ref.url, headers=headers, timeout=60)
+                    f = comms.http.fetch(
+                        ref.url, use_fake_user_agent=self._use_fake_user_agent)
 
                     # Try to get Content-Type from server
-                    contentType = f.headers.get('content-type')
+                    contentType = f.response_headers.get('content-type')
                     if contentType and not self.MIME.search(contentType):
                         if ref.link.lower().endswith('.pdf') and \
                            not self.getOption('ignorepdf'):
@@ -556,7 +556,7 @@
                         continue
 
                     # Get the real url where we end (http redirects !)
-                    redir = f.url
+                    redir = f.data.url
                     if redir != ref.link and \
                        domain.findall(redir) == domain.findall(link):
                         if soft404.search(redir) and \
@@ -572,15 +572,15 @@
                                 u'Redirect to root : {0} ', ref.link))
                             continue
 
-                    if f.status_code != requests.codes.ok:
+                    if f.status != requests.codes.ok:
                         pywikibot.output(u'HTTP error (%s) for %s on %s'
-                                         % (f.status_code, ref.url,
+                                         % (f.status, ref.url,
                                             page.title(asLink=True)),
                                          toStdout=True)
                         # 410 Gone, indicates that the resource has been 
purposely
                         # removed
-                        if f.status_code == 410 or \
-                           (f.status_code == 404 and (u'\t%s\t' % ref.url in 
deadLinks)):
+                        if f.status == 410 or \
+                           (f.status == 404 and (u'\t%s\t' % ref.url in 
deadLinks)):
                             repl = ref.refDead()
                             new_text = new_text.replace(match.group(), repl)
                         continue
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index f81d8c3..b8cb323 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -279,6 +279,8 @@
     Returns a (boolean, string) tuple saying if the page is online and 
including
     a status reason.
 
+    Per-domain user-agent faking is not supported in this deprecated class.
+
     Warning: Also returns false if your Internet connection isn't working
     correctly! (This will give a Socket Error)
 
@@ -292,11 +294,19 @@
         redirectChain is a list of redirects which were resolved by
         resolveRedirect(). This is needed to detect redirect loops.
         """
-        self._user_agent = comms.http.get_fake_user_agent()
         self.url = url
         self.serverEncoding = serverEncoding
+
+        fake_ua_config = config.fake_user_agent_default.get(
+            'weblinkchecker', False)
+        if fake_ua_config and isinstance(fake_ua_config, str):
+            user_agent = fake_ua_config
+        elif fake_ua_config:
+            user_agent = comms.http.fake_user_agent()
+        else:
+            user_agent = comms.http.user_agent()
         self.header = {
-            'User-agent': self._user_agent,
+            'user-agent': user_agent,
             'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                       'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
             'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
@@ -542,10 +552,8 @@
         threading.Thread.__init__(self)
         self.page = page
         self.url = url
-        self._user_agent = comms.http.get_fake_user_agent()
         self.history = history
         self.header = {
-            'User-agent': self._user_agent,
             'Accept': 'text/xml,application/xml,application/xhtml+xml,'
                       'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
             'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3',
@@ -557,6 +565,8 @@
         self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8',
                                                                'replace'))
         self.HTTPignore = HTTPignore
+        self._use_fake_user_agent = config.fake_user_agent_default.get(
+            'weblinkchecker', False)
         self.day = day
 
     def run(self):
@@ -564,8 +574,8 @@
         ok = False
         try:
             header = self.header
-            timeout = pywikibot.config.socket_timeout
-            r = requests.get(self.url, headers=header, timeout=timeout)
+            r = comms.http.fetch(
+                self.url, headers=header, 
use_fake_user_agent=self._use_fake_user_agent)
         except requests.exceptions.InvalidURL:
             message = i18n.twtranslate(self.page.site,
                                        'weblinkchecker-badurl_msg',
@@ -574,11 +584,11 @@
             pywikibot.output('Exception while processing URL %s in page %s'
                              % (self.url, self.page.title()))
             raise
-        if (r.status_code == requests.codes.ok and
-                str(r.status_code) not in self.HTTPignore):
+        if (r.status == requests.codes.ok and
+                str(r.status) not in self.HTTPignore):
             ok = True
         else:
-            message = '{0} {1}'.format(r.status_code, r.reason)
+            message = '{0}'.format(r.status)
         if ok:
             if self.history.setLinkAlive(self.url):
                 pywikibot.output('*Link to %s in [[%s]] is back alive.'
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 05df31b..06f7117 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -285,7 +285,7 @@
         self.assertIn('Python/' + str(PYTHON_VERSION[0]), http.user_agent())
 
 
-class FakeUserAgentTestCase(TestCase):
+class DryFakeUserAgentTestCase(TestCase):
 
     """Test the generation of fake user agents.
 
@@ -296,15 +296,96 @@
 
     net = False
 
+    def _test_fake_user_agent_randomness(self):
+        """Test if user agent returns are randomized."""
+        self.assertNotEqual(http.fake_user_agent(), http.fake_user_agent())
+
+    @require_modules('browseragents')
+    def test_with_browseragents(self):
+        """Test fake user agent generation with browseragents module."""
+        self._test_fake_user_agent_randomness()
+
+    @require_modules('fake_useragent')
+    def test_with_fake_useragent(self):
+        """Test fake user agent generation with fake_useragent module."""
+        self._test_fake_user_agent_randomness()
+
+
+class LiveFakeUserAgentTestCase(TestCase):
+
+    """Test the usage of fake user agent."""
+
+    sites = {
+        'httpbin': {
+            'hostname': 'httpbin.org',
+        },
+    }
+
+    def setUp(self):
+        """Set up the unit test."""
+        self.orig_fake_user_agent_exceptions = 
config.fake_user_agent_exceptions
+        super(LiveFakeUserAgentTestCase, self).setUp()
+
+    def tearDown(self):
+        """Tear down unit test."""
+        config.fake_user_agent_exceptions = 
self.orig_fake_user_agent_exceptions
+        super(LiveFakeUserAgentTestCase, self).tearDown()
+
+    def _test_fetch_use_fake_user_agent(self):
+        """Test `use_fake_user_agent` argument of http.fetch."""
+        # Existing headers
+        r = http.fetch(
+            'http://httpbin.org/status/200', headers={'user-agent': 
'EXISTING'})
+        self.assertEqual(r.headers['user-agent'], 'EXISTING')
+
+        # Argument value changes
+        r = http.fetch('http://httpbin.org/status/200', 
use_fake_user_agent=True)
+        self.assertNotEqual(r.headers['user-agent'], http.user_agent())
+        r = http.fetch('http://httpbin.org/status/200', 
use_fake_user_agent=False)
+        self.assertEqual(r.headers['user-agent'], http.user_agent())
+        r = http.fetch(
+            'http://httpbin.org/status/200', use_fake_user_agent='ARBITRARY')
+        self.assertEqual(r.headers['user-agent'], 'ARBITRARY')
+
+        # Manually overridden domains
+        config.fake_user_agent_exceptions = {'httpbin.org': 'OVERRIDDEN'}
+        r = http.fetch(
+            'http://httpbin.org/status/200', use_fake_user_agent=False)
+        self.assertEqual(r.headers['user-agent'], 'OVERRIDDEN')
+
+    @require_modules('browseragents')
+    def test_fetch_with_browseragents(self):
+        """Test method with browseragents module."""
+        self._test_fetch_use_fake_user_agent()
+
+    @require_modules('fake_useragent')
+    def test_fetch_with_fake_useragent(self):
+        """Test method with fake_useragent module."""
+        self._test_fetch_use_fake_user_agent()
+
+
+class GetFakeUserAgentTestCase(TestCase):
+
+    """Test the deprecated get_fake_user_agent()."""
+
+    net = False
+
     def setUp(self):
         """Set up unit test."""
         self.orig_fake_user_agent = config.fake_user_agent
+        super(GetFakeUserAgentTestCase, self).setUp()
 
     def tearDown(self):
         """Tear down unit test."""
         config.fake_user_agent = self.orig_fake_user_agent
+        super(GetFakeUserAgentTestCase, self).tearDown()
 
-    def _test_fake_user_agent_config(self):
+    def _test_fake_user_agent_randomness(self):
+        """Test if user agent returns are randomized."""
+        config.fake_user_agent = True
+        self.assertNotEqual(http.get_fake_user_agent(), 
http.get_fake_user_agent())
+
+    def _test_config_settings(self):
         """Test if method honours configuration toggle."""
         # ON: True and None in config are considered turned on.
         config.fake_user_agent = True
@@ -315,25 +396,20 @@
         # OFF: All other values won't make it return random UA.
         config.fake_user_agent = False
         self.assertEqual(http.get_fake_user_agent(), http.user_agent())
-        config.fake_user_agent = 'ArbitraryValue'
-        self.assertEqual(http.get_fake_user_agent(), 'ArbitraryValue')
-
-    def _test_fake_user_agent_randomness(self):
-        """Test if user agent returns are randomized."""
-        config.fake_user_agent = True
-        self.assertNotEqual(http.get_fake_user_agent(), 
http.get_fake_user_agent())
+        config.fake_user_agent = 'ARBITRARY'
+        self.assertEqual(http.get_fake_user_agent(), 'ARBITRARY')
 
     @require_modules('browseragents')
     def test_with_browseragents(self):
-        """Test fake user agent generation with browseragents module."""
-        self._test_fake_user_agent_config()
+        """Test method with browseragents module."""
         self._test_fake_user_agent_randomness()
+        self._test_config_settings()
 
     @require_modules('fake_useragent')
     def test_with_fake_useragent(self):
-        """Test fake user agent generation with fake_useragent module."""
-        self._test_fake_user_agent_config()
+        """Test method with fake_useragent module."""
         self._test_fake_user_agent_randomness()
+        self._test_config_settings()
 
 
 class CharsetTestCase(TestCase):

-- 
To view, visit https://gerrit.wikimedia.org/r/325241
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e
Gerrit-PatchSet: 17
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dargasia <t...@riseup.net>
Gerrit-Reviewer: Dargasia <t...@riseup.net>
Gerrit-Reviewer: John Vandenberg <jay...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to