jenkins-bot has submitted this change and it was merged. Change subject: Improve fake user agent usage control ......................................................................
Improve fake user agent usage control comms.http.get_fake_user_agent() is renamed to fake_user_agent() to match the style of user_agent(). Logic checking config variable fake_user_agent is removed, as it should not be responsible for deciding whether a fake UA should be used. Test cases testing the config-checking logic are removed. The use_fake_user_agent argument is added to comms.http.fetch(), which will specify if fake UAs should be used when the method is called to make HTTP requests. Test cases testing this logic are added. The fake_user_agent config variable is deprecated. fake_user_agent_default is introduced to set per-script behaviour. fake_user_agent_exceptions is introduced to set per-domain behaviours (will be checked by fetch()). Bug: T152075 Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e --- M pywikibot/comms/http.py M pywikibot/config2.py M scripts/reflinks.py M scripts/weblinkchecker.py M tests/http_tests.py 5 files changed, 186 insertions(+), 61 deletions(-) Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py index 908f3f1..7bf6235 100644 --- a/pywikibot/comms/http.py +++ b/pywikibot/comms/http.py @@ -38,10 +38,11 @@ if sys.version_info[0] > 2: from http import cookiejar as cookielib - from urllib.parse import quote + from urllib.parse import quote, urlparse else: import cookielib from urllib2 import quote + from urlparse import urlparse from pywikibot import config @@ -53,6 +54,7 @@ ) from pywikibot.logging import critical, debug, error, log, warning from pywikibot.tools import ( + deprecated, deprecate_arg, file_mode_checker, issue_deprecation_warning, @@ -234,31 +236,43 @@ return formatted +@deprecated('pywikibot.comms.http.fake_user_agent') def get_fake_user_agent(): """ - Return a user agent to be used when faking a web browser. + Return a fake user agent depending on `fake_user_agent` option in config. + + Deprecated, use fake_user_agent() instead. 
@rtype: str """ - # Check fake_user_agent configuration variable if isinstance(config.fake_user_agent, StringTypes): - return pywikibot.config2.fake_user_agent + return config.fake_user_agent + elif config.fake_user_agent or config.fake_user_agent is None: + return fake_user_agent() + else: + return user_agent() - if config.fake_user_agent is None or config.fake_user_agent is True: - try: - import browseragents - return browseragents.core.random() - except ImportError: - pass - try: - import fake_useragent - return fake_useragent.fake.UserAgent().random - except ImportError: - pass +def fake_user_agent(): + """ + Return a fake user agent. - # Use the default real user agent - return user_agent() + @rtype: str + """ + try: + import browseragents + return browseragents.core.random() + except ImportError: + pass + + try: + import fake_useragent + return fake_useragent.fake.UserAgent().random + except ImportError: + pass + + raise ImportError( # Actually complain when neither is installed. + 'Either browseragents or fake_useragent must be installed to get fake UAs.') @deprecate_arg('ssl', None) @@ -443,7 +457,7 @@ def fetch(uri, method="GET", body=None, headers=None, - default_error_handling=True, **kwargs): + default_error_handling=True, use_fake_user_agent=False, **kwargs): """ Blocking HTTP request. @@ -454,8 +468,27 @@ @param default_error_handling: Use default error handling @type default_error_handling: bool + @type use_fake_user_agent: bool, str + @param use_fake_user_agent: Set to True to use fake UA, False to use + pywikibot's UA, str to specify own UA. This behaviour might be + overridden by domain in config. @rtype: L{threadedhttp.HttpRequest} """ + # Change user agent depending on fake UA settings. + # Set header to new UA if needed. + headers = headers or {} + if not headers.get('user-agent', None): # Skip if already specified in request. + # Get fake UA exceptions from `fake_user_agent_exceptions` config. 
+ uri_domain = urlparse(uri).netloc + use_fake_user_agent = config.fake_user_agent_exceptions.get( + uri_domain, use_fake_user_agent) + + if use_fake_user_agent and isinstance( + use_fake_user_agent, StringTypes): # Custom UA. + headers['user-agent'] = use_fake_user_agent + elif use_fake_user_agent is True: + headers['user-agent'] = fake_user_agent() + request = _enqueue(uri, method, body, headers, **kwargs) assert(request._data is not None) # if there's no data in the answer we're in trouble # Run the error handling callback in the callers thread so exceptions diff --git a/pywikibot/config2.py b/pywikibot/config2.py index 9451eb5..a98aeb7 100644 --- a/pywikibot/config2.py +++ b/pywikibot/config2.py @@ -93,7 +93,7 @@ _private_values = ['authenticate', 'proxy', 'db_password'] _deprecated_variables = ['use_SSL_onlogin', 'use_SSL_always', - 'available_ssl_project'] + 'available_ssl_project', 'fake_user_agent'] # ############# ACCOUNT SETTINGS ############## @@ -137,16 +137,22 @@ user_agent_format = ('{script_product} ({script_comments}) {pwb} ({revision}) ' '{http_backend} {python}') -# Fake user agent -# Used to retrieve pages in reflinks.py, -# to work around user-agent sniffing webpages -# When None or True, -# Use random user agent if either browseragents or fake_useragent -# packages are installed -# Otherwise use pywikibot.comms.http.user_agent() -# When set to False, -# disables use of automatic user agents -fake_user_agent = None +# Fake user agent. +# Some external websites reject bot-like user agents. It is possible to use +# fake user agents in requests to these websites. +# It is recommended to default this to False and use on an as-needed basis. +# +# Default behaviours in modules that can utilize fake UAs. +# True for enabling fake UA, False for disabling / using pywikibot's own UA, str +# to specify custom UA. +fake_user_agent_default = {'reflinks': False, 'weblinkchecker': False} +# Website domains excepted to the default behaviour. 
+# True for enabling, False for disabling, str to hardcode a UA. +# Example: {'problematic.site.example': True, +# 'prefers.specific.ua.example': 'snakeoil/4.2'} +fake_user_agent_exceptions = {} +# This following option is deprecated in favour of finer control options above. +fake_user_agent = False # The default interface for communicating with the site # currently the only defined interface is 'APISite', so don't change this! diff --git a/scripts/reflinks.py b/scripts/reflinks.py index c5daf1d..1e930a4 100755 --- a/scripts/reflinks.py +++ b/scripts/reflinks.py @@ -59,6 +59,7 @@ import pywikibot from pywikibot import comms, i18n, pagegenerators, textlib, Bot +from pywikibot import config2 as config from pywikibot.pagegenerators import ( XMLDumpPageGenerator as _XMLDumpPageGenerator, ) @@ -395,8 +396,7 @@ super(ReferencesRobot, self).__init__(**kwargs) self.generator = generator self.site = pywikibot.Site() - self._user_agent = comms.http.get_fake_user_agent() - pywikibot.log('Using fake user agent: {0}'.format(self._user_agent)) + self._use_fake_user_agent = config.fake_user_agent_default.get('reflinks', False) # Check manual = 'mw:Manual:Pywikibot/refLinks' code = None @@ -494,7 +494,6 @@ raise editedpages = 0 - headers = {'user-agent': self._user_agent} for page in self.generator: try: # Load the page's text from the wiki @@ -526,10 +525,11 @@ f = None try: - f = requests.get(ref.url, headers=headers, timeout=60) + f = comms.http.fetch( + ref.url, use_fake_user_agent=self._use_fake_user_agent) # Try to get Content-Type from server - contentType = f.headers.get('content-type') + contentType = f.response_headers.get('content-type') if contentType and not self.MIME.search(contentType): if ref.link.lower().endswith('.pdf') and \ not self.getOption('ignorepdf'): @@ -556,7 +556,7 @@ continue # Get the real url where we end (http redirects !) 
- redir = f.url + redir = f.data.url if redir != ref.link and \ domain.findall(redir) == domain.findall(link): if soft404.search(redir) and \ @@ -572,15 +572,15 @@ u'Redirect to root : {0} ', ref.link)) continue - if f.status_code != requests.codes.ok: + if f.status != requests.codes.ok: pywikibot.output(u'HTTP error (%s) for %s on %s' - % (f.status_code, ref.url, + % (f.status, ref.url, page.title(asLink=True)), toStdout=True) # 410 Gone, indicates that the resource has been purposely # removed - if f.status_code == 410 or \ - (f.status_code == 404 and (u'\t%s\t' % ref.url in deadLinks)): + if f.status == 410 or \ + (f.status == 404 and (u'\t%s\t' % ref.url in deadLinks)): repl = ref.refDead() new_text = new_text.replace(match.group(), repl) continue diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py index f81d8c3..b8cb323 100755 --- a/scripts/weblinkchecker.py +++ b/scripts/weblinkchecker.py @@ -279,6 +279,8 @@ Returns a (boolean, string) tuple saying if the page is online and including a status reason. + Per-domain user-agent faking is not supported in this deprecated class. + Warning: Also returns false if your Internet connection isn't working correctly! (This will give a Socket Error) @@ -292,11 +294,19 @@ redirectChain is a list of redirects which were resolved by resolveRedirect(). This is needed to detect redirect loops. 
""" - self._user_agent = comms.http.get_fake_user_agent() self.url = url self.serverEncoding = serverEncoding + + fake_ua_config = config.fake_user_agent_default.get( + 'weblinkchecker', False) + if fake_ua_config and isinstance(fake_ua_config, str): + user_agent = fake_ua_config + elif fake_ua_config: + user_agent = comms.http.fake_user_agent() + else: + user_agent = comms.http.user_agent() self.header = { - 'User-agent': self._user_agent, + 'user-agent': user_agent, 'Accept': 'text/xml,application/xml,application/xhtml+xml,' 'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', 'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3', @@ -542,10 +552,8 @@ threading.Thread.__init__(self) self.page = page self.url = url - self._user_agent = comms.http.get_fake_user_agent() self.history = history self.header = { - 'User-agent': self._user_agent, 'Accept': 'text/xml,application/xml,application/xhtml+xml,' 'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', 'Accept-Language': 'de-de,de;q=0.8,en-us;q=0.5,en;q=0.3', @@ -557,6 +565,8 @@ self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8', 'replace')) self.HTTPignore = HTTPignore + self._use_fake_user_agent = config.fake_user_agent_default.get( + 'weblinkchecker', False) self.day = day def run(self): @@ -564,8 +574,8 @@ ok = False try: header = self.header - timeout = pywikibot.config.socket_timeout - r = requests.get(self.url, headers=header, timeout=timeout) + r = comms.http.fetch( + self.url, headers=header, use_fake_user_agent=self._use_fake_user_agent) except requests.exceptions.InvalidURL: message = i18n.twtranslate(self.page.site, 'weblinkchecker-badurl_msg', @@ -574,11 +584,11 @@ pywikibot.output('Exception while processing URL %s in page %s' % (self.url, self.page.title())) raise - if (r.status_code == requests.codes.ok and - str(r.status_code) not in self.HTTPignore): + if (r.status == requests.codes.ok and + str(r.status) not in self.HTTPignore): ok = True else: - message = '{0} 
{1}'.format(r.status_code, r.reason) + message = '{0}'.format(r.status) if ok: if self.history.setLinkAlive(self.url): pywikibot.output('*Link to %s in [[%s]] is back alive.' diff --git a/tests/http_tests.py b/tests/http_tests.py index 05df31b..06f7117 100644 --- a/tests/http_tests.py +++ b/tests/http_tests.py @@ -285,7 +285,7 @@ self.assertIn('Python/' + str(PYTHON_VERSION[0]), http.user_agent()) -class FakeUserAgentTestCase(TestCase): +class DryFakeUserAgentTestCase(TestCase): """Test the generation of fake user agents. @@ -296,15 +296,96 @@ net = False + def _test_fake_user_agent_randomness(self): + """Test if user agent returns are randomized.""" + self.assertNotEqual(http.fake_user_agent(), http.fake_user_agent()) + + @require_modules('browseragents') + def test_with_browseragents(self): + """Test fake user agent generation with browseragents module.""" + self._test_fake_user_agent_randomness() + + @require_modules('fake_useragent') + def test_with_fake_useragent(self): + """Test fake user agent generation with fake_useragent module.""" + self._test_fake_user_agent_randomness() + + +class LiveFakeUserAgentTestCase(TestCase): + + """Test the usage of fake user agent.""" + + sites = { + 'httpbin': { + 'hostname': 'httpbin.org', + }, + } + + def setUp(self): + """Set up the unit test.""" + self.orig_fake_user_agent_exceptions = config.fake_user_agent_exceptions + super(LiveFakeUserAgentTestCase, self).setUp() + + def tearDown(self): + """Tear down unit test.""" + config.fake_user_agent_exceptions = self.orig_fake_user_agent_exceptions + super(LiveFakeUserAgentTestCase, self).tearDown() + + def _test_fetch_use_fake_user_agent(self): + """Test `use_fake_user_agent` argument of http.fetch.""" + # Existing headers + r = http.fetch( + 'http://httpbin.org/status/200', headers={'user-agent': 'EXISTING'}) + self.assertEqual(r.headers['user-agent'], 'EXISTING') + + # Argument value changes + r = http.fetch('http://httpbin.org/status/200', use_fake_user_agent=True) + 
self.assertNotEqual(r.headers['user-agent'], http.user_agent()) + r = http.fetch('http://httpbin.org/status/200', use_fake_user_agent=False) + self.assertEqual(r.headers['user-agent'], http.user_agent()) + r = http.fetch( + 'http://httpbin.org/status/200', use_fake_user_agent='ARBITRARY') + self.assertEqual(r.headers['user-agent'], 'ARBITRARY') + + # Manually overridden domains + config.fake_user_agent_exceptions = {'httpbin.org': 'OVERRIDDEN'} + r = http.fetch( + 'http://httpbin.org/status/200', use_fake_user_agent=False) + self.assertEqual(r.headers['user-agent'], 'OVERRIDDEN') + + @require_modules('browseragents') + def test_fetch_with_browseragents(self): + """Test method with browseragents module.""" + self._test_fetch_use_fake_user_agent() + + @require_modules('fake_useragent') + def test_fetch_with_fake_useragent(self): + """Test method with fake_useragent module.""" + self._test_fetch_use_fake_user_agent() + + +class GetFakeUserAgentTestCase(TestCase): + + """Test the deprecated get_fake_user_agent().""" + + net = False + def setUp(self): """Set up unit test.""" self.orig_fake_user_agent = config.fake_user_agent + super(GetFakeUserAgentTestCase, self).setUp() def tearDown(self): """Tear down unit test.""" config.fake_user_agent = self.orig_fake_user_agent + super(GetFakeUserAgentTestCase, self).tearDown() - def _test_fake_user_agent_config(self): + def _test_fake_user_agent_randomness(self): + """Test if user agent returns are randomized.""" + config.fake_user_agent = True + self.assertNotEqual(http.get_fake_user_agent(), http.get_fake_user_agent()) + + def _test_config_settings(self): """Test if method honours configuration toggle.""" # ON: True and None in config are considered turned on. config.fake_user_agent = True @@ -315,25 +396,20 @@ # OFF: All other values won't make it return random UA. 
config.fake_user_agent = False self.assertEqual(http.get_fake_user_agent(), http.user_agent()) - config.fake_user_agent = 'ArbitraryValue' - self.assertEqual(http.get_fake_user_agent(), 'ArbitraryValue') - - def _test_fake_user_agent_randomness(self): - """Test if user agent returns are randomized.""" - config.fake_user_agent = True - self.assertNotEqual(http.get_fake_user_agent(), http.get_fake_user_agent()) + config.fake_user_agent = 'ARBITRARY' + self.assertEqual(http.get_fake_user_agent(), 'ARBITRARY') @require_modules('browseragents') def test_with_browseragents(self): - """Test fake user agent generation with browseragents module.""" - self._test_fake_user_agent_config() + """Test method with browseragents module.""" self._test_fake_user_agent_randomness() + self._test_config_settings() @require_modules('fake_useragent') def test_with_fake_useragent(self): - """Test fake user agent generation with fake_useragent module.""" - self._test_fake_user_agent_config() + """Test method with fake_useragent module.""" self._test_fake_user_agent_randomness() + self._test_config_settings() class CharsetTestCase(TestCase): -- To view, visit https://gerrit.wikimedia.org/r/325241 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I28594fd1b5ccb6ed3e885db5600bb0464dccfa0e Gerrit-PatchSet: 17 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Dargasia <t...@riseup.net> Gerrit-Reviewer: Dargasia <t...@riseup.net> Gerrit-Reviewer: John Vandenberg <jay...@gmail.com> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits