jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] Fully flexible Site(url)
......................................................................
[FEAT] Fully flexible Site(url)
It completely redesigns how it looks up sites using a URL. Instead of creating
a regex which parses any combination of sites possible it first looks through
the languages in a family to determine which codes use the same domain. After
that it checks if the site uses either the article path or the index.php path.
This uses a Site instance instead of the Family properties to make it fully
flexible and remove the necessity of nicepath. This also removes all regex
related methods in Family.
It fixes a typo in `test_each_family` as it didn't add a separator between the
index.php file and the placeholder. It also removes the test_family from the
skipped entries, as it'll skip test sites anyway (as they aren't in `codes`) so
that it can now match the test family instead.
Bug: T85658
Change-Id: I4cc8bd7b008f0220d87e6a61dd63e4e50194791d
---
M pywikibot/__init__.py
M pywikibot/families/test_family.py
M pywikibot/family.py
M tests/family_tests.py
M tests/link_tests.py
5 files changed, 135 insertions(+), 176 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index b1d2cc3..64b2f07 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -559,31 +559,33 @@
_logger = "wiki"
if url:
- if url in _url_cache:
- cached = _url_cache[url]
- if cached:
- code = cached[0]
- fam = cached[1]
- else:
- raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
- else:
+ if url not in _url_cache:
+ matched_sites = []
# Iterate through all families and look, which does apply to
# the given URL
for fam in config.family_files:
- try:
- family = pywikibot.family.Family.load(fam)
- code = family.from_url(url)
- if code:
- _url_cache[url] = (code, fam)
- break
- except Exception as e:
- pywikibot.warning('Error in Family(%s).from_url: %s'
- % (fam, e))
+ family = pywikibot.family.Family.load(fam)
+ code = family.from_url(url)
+ if code is not None:
+ matched_sites += [(code, fam)]
+
+ if matched_sites:
+ if len(matched_sites) > 1:
+ pywikibot.warning(
+ 'Found multiple matches for URL "{0}": {1} (use first)'
+ .format(url, ', '.join(str(s) for s in matched_sites)))
+ _url_cache[url] = matched_sites[0]
else:
- _url_cache[url] = None
# TODO: As soon as AutoFamily is ready, try and use an
# AutoFamily
- raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
+ _url_cache[url] = None
+
+ cached = _url_cache[url]
+ if cached:
+ code = cached[0]
+ fam = cached[1]
+ else:
+ raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
else:
# Fallback to config defaults
code = code or config.mylang
diff --git a/pywikibot/families/test_family.py
b/pywikibot/families/test_family.py
index 87bdaf1..fa26d62 100644
--- a/pywikibot/families/test_family.py
+++ b/pywikibot/families/test_family.py
@@ -14,7 +14,3 @@
name = 'test'
domain = 'test.wikipedia.org'
-
- def from_url(self, url):
- """Return None to indicate no code of this family is accepted."""
- return None # Don't accept this, but 'test' of 'wikipedia'
diff --git a/pywikibot/family.py b/pywikibot/family.py
index 66b676d..e933d6f 100644
--- a/pywikibot/family.py
+++ b/pywikibot/family.py
@@ -1060,13 +1060,19 @@
# Override this ONLY if the wiki family requires a path prefix
return ''
- def base_url(self, code, uri):
+ def _hostname(self, code):
+ """Return the protocol and hostname."""
protocol = self.protocol(code)
if protocol == 'https':
host = self.ssl_hostname(code)
- uri = self.ssl_pathprefix(code) + uri
else:
host = self.hostname(code)
+ return protocol, host
+
+ def base_url(self, code, uri):
+ protocol, host = self._hostname(code)
+ if protocol == 'https':
+ uri = self.ssl_pathprefix(code) + uri
return urlparse.urljoin('{0}://{1}'.format(protocol, host), uri)
def path(self, code):
@@ -1078,38 +1084,9 @@
def apipath(self, code):
return '%s/api.php' % self.scriptpath(code)
- # TODO: @deprecated('APISite.article_path')
- # As soon as from_url does not need nicepath anymore
+ @deprecated('APISite.article_path')
def nicepath(self, code):
return '/wiki/'
-
- def _get_path_regex(self, code):
- """
- Return a regex matching a site URL path.
-
- @return: regex string
- @rtype: unicode
- """
- # The trailing slash after path(code) is optional.
- return ('(?:%s?|%s)' %
- (re.escape(self.path(code) + '/'),
- re.escape(self.nicepath(code))))
-
- def _get_url_regex(self, code):
- """
- Return a regex matching a site URL.
-
- Regex match group 1 is the domain.
-
- Does not make use of ssl_hostname or ssl_pathprefix.
-
- @return: regex string
- @rtype: unicode
- """
- return (r'(?:\/\/|%s\:\/\/)(%s)%s' %
- (self.protocol(code),
- re.escape(self.hostname(code)),
- self._get_path_regex(code)))
def rcstream_host(self, code):
raise NotImplementedError("This family does not support RCStream")
@@ -1122,119 +1099,82 @@
def nice_get_address(self, code, title):
return '%s%s' % (self.nicepath(code), title)
- def _get_regex_all(self):
- """
- Return a regex matching any site.
-
- It is using Family methods with code set to 'None' initially.
- That will raise KeyError if the Family methods use the code to
- lookup the correct value in a dictionary such as C{langs}.
- On KeyError, it retries it with each key from C{langs}.
-
- @return: regex string
- @rtype: unicode
- """
- if hasattr(self, '_regex_all'):
- return self._regex_all
-
- try:
- self._regex_all = self._get_url_regex(None)
- return self._regex_all
- except KeyError:
- # Probably automatically generated family
- pass
-
- # If there is only one code, use it.
- if len(self.langs) == 1:
- code = next(iter(self.langs.keys()))
- self._regex_all = self._get_url_regex(code)
- return self._regex_all
-
- try:
- protocol = self.protocol(None) + '\:\/\/'
- except KeyError:
- protocol = None
-
- try:
- hostname = re.escape(self.hostname(None))
- except KeyError:
- hostname = None
-
- try:
- path = self._get_path_regex(None)
- except KeyError:
- path = None
-
- # If two or more of the three above varies, the regex cant be optimised
- none_count = [protocol, hostname, path].count(None)
-
- if none_count > 1:
- self._regex_all = ('(?:%s)'
- % '|'.join(self._get_url_regex(code)
- for code in self.langs.keys()))
- return self._regex_all
-
- if not protocol:
- protocols = set(self.protocol(code) + '\:\/\/'
- for code in self.langs.keys())
- protocol = '|'.join(protocols)
-
- # Allow protocol neutral '//'
- protocol = '(?:\/\/|%s)' % protocol
-
- if not hostname:
- hostnames = set(re.escape(self.hostname(code))
- for code in self.langs.keys())
- hostname = '|'.join(hostnames)
-
- # capture hostname
- hostname = '(' + hostname + ')'
-
- if not path:
- regexes = set(self._get_path_regex(code)
- for code in self.langs.keys())
- path = '(?:%s)' % '|'.join(regexes)
-
- self._regex_all = protocol + hostname + path
- return self._regex_all
+ # List of codes which aren't returned by from_url; True returns None always
+ _ignore_from_url = []
def from_url(self, url):
"""
Return whether this family matches the given url.
- It must match URLs generated via C{self.langs} and
- L{Family.nice_get_address} or L{Family.path}. If the protocol doesn't
- match but is present in the interwikimap it'll log this.
+ It is first checking if a domain of this family is in the the domain of
+ the URL. If that is the case it's checking all codes and verifies that
+ a path generated via L{APISite.article_path} and L{Family.path} matches
+ the path of the URL together with the hostname for that code.
- It ignores $1 in the url, and anything that follows it.
+ It is using L{Family.domains} to first check if a domain applies and
+ then iterates over L{Family.codes} to actually determine which code
+ applies.
+ @param url: the URL which may contain a C{$1}. If it's missing it is
+ assumed to be at the end and if it's present nothing is allowed
+ after it.
+ @type url: str
@return: The language code of the url. None if that url is not from
this family.
@rtype: str or None
- @raises RuntimeError: Mismatch between Family langs dictionary and
- URL regex.
+ @raises RuntimeError: When there are multiple languages in this family
+ which would work with the given URL.
+ @raises ValueError: When text is present after $1.
"""
- if '$1' in url:
- url = url[:url.find('$1')]
-
- url_match = re.match(self._get_regex_all(), url)
- if not url_match:
+ if self._ignore_from_url is True:
return None
+ else:
+ ignored = self._ignore_from_url
- for code, domain in self.langs.items():
- if domain is None:
- warn('Family(%s): langs missing domain names' % self.name,
- FamilyMaintenanceWarning)
- elif domain == url_match.group(1):
- return code
+ parsed = urlparse.urlparse(url)
+ if not re.match('^(https?)?$', parsed.scheme):
+ return None
+ path = parsed.path
+ if parsed.query:
+ path += '?' + parsed.query
- # if domain was None, this will return the only possible code.
- if len(self.langs) == 1:
- return next(iter(self.langs))
+ # Discard $1 and everything after it
+ path, _, suffix = path.partition('$1')
+ if suffix:
+ raise ValueError('Text after the $1 placeholder is not supported '
+ '(T111513).')
- raise RuntimeError(
- 'Family(%s): matched regex has not matched a domain in langs'
- % self.name)
+ matched_sites = []
+ for domain in self.domains:
+ if domain in parsed.netloc:
+ break
+ else:
+ domain = False
+ if domain is not False:
+ for code in self.codes:
+ if code in ignored:
+ continue
+ if self._hostname(code)[1] == parsed.netloc:
+ # Use the code and family instead of the url
+ # This is only creating a Site instance if domain matches
+ site = pywikibot.Site(code, self.name)
+ pywikibot.log('Found candidate {0}'.format(site))
+ site_paths = [site.path()] * 3
+ site_paths[1] += '/'
+ site_paths[2] += '?title='
+ site_paths += [site.article_path]
+
+ if path in site_paths:
+ matched_sites += [site]
+
+ if len(matched_sites) == 1:
+ return matched_sites[0].code
+ elif not matched_sites:
+ return None
+ else:
+ raise RuntimeError(
+ 'Found multiple matches for URL "{0}": {1}'
+ .format(url, ', '.join(str(s) for s in matched_sites)))
def maximum_GET_length(self, code):
return config.maximum_GET_length
diff --git a/tests/family_tests.py b/tests/family_tests.py
index 3f0b008..bb5b11f 100644
--- a/tests/family_tests.py
+++ b/tests/family_tests.py
@@ -19,7 +19,9 @@
unittest,
TestCase,
DeprecationTestCase,
+ PatchingTestCase,
)
+from tests.utils import DrySite
if not PY2:
basestring = (str, )
@@ -143,23 +145,34 @@
{'a': 'b', 'c': None})
-class TestFamilyUrlRegex(TestCase):
+class TestFamilyUrlRegex(PatchingTestCase):
"""Test family URL regex."""
net = False
- def test_get_regex_wikipedia_precise(self):
- """Test the family regex is optimal."""
- f = Family.load('wikipedia')
- regex = f._get_regex_all()
+ @PatchingTestCase.patched(pywikibot, 'Site')
+ def Site(self, code, fam, *args, **kwargs):
+ """Own DrySite creator."""
+ self.assertEqual(args, tuple())
+ self.assertEqual(kwargs, {})
+ self.assertEqual(code, self.current_code)
+ self.assertEqual(fam, self.current_family)
+ site = DrySite(code, fam, None, None)
+ site._siteinfo._cache['general'] = ({'articlepath': self.article_path},
+ True)
+ return site
- self.assertTrue(regex.startswith('(?:\/\/|https\:\/\/)('))
- self.assertIn('vo\.wikipedia\.org', regex)
- self.assertTrue(regex.endswith(')(?:\/w\/index\.php\/?|\/wiki\/)'))
+ def setUp(self):
+ """Setup default article path."""
+ super(TestFamilyUrlRegex, self).setUp()
+ self.article_path = '/wiki/$1'
def test_from_url_wikipedia_extra(self):
"""Test various URLs against wikipedia regex."""
+ self.current_code = 'vo'
+ self.current_family = 'wikipedia'
+
f = Family.load('wikipedia')
prefix = 'https://vo.wikipedia.org'
@@ -171,13 +184,18 @@
self.assertEqual(f.from_url(prefix + '/wiki/$1'), 'vo')
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
- self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'), 'vo')
self.assertEqual(f.from_url(prefix + '/w/index.php/$1'), 'vo')
self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
- self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'), 'vo')
+
+ # Text after $1 is not allowed
+ self.assertRaises(ValueError, f.from_url,
+ '//vo.wikipedia.org/wiki/$1/foo')
+
+ # the IWM may contain the wrong protocol, but it's only used to
+ # determine a site so using HTTP or HTTPS is not an issue
+ self.assertEqual(f.from_url('http://vo.wikipedia.org/wiki/$1'), 'vo')
# wrong protocol
- self.assertIsNone(f.from_url('http://vo.wikipedia.org/wiki/$1'))
self.assertIsNone(f.from_url('ftp://vo.wikipedia.org/wiki/$1'))
# wrong code
self.assertIsNone(f.from_url('https://foobar.wikipedia.org/wiki/$1'))
@@ -191,16 +209,19 @@
def test_each_family(self):
"""Test each family builds a working regex."""
for family in pywikibot.config.family_files:
+ self.current_family = family
family = Family.load(family)
- # Test family does not respond to from_url due to overlap
- # with Wikipedia family.
- if family.name == 'test':
- continue
- for code in family.langs:
- url = ('%s://%s%s$1' % (family.protocol(code),
- family.hostname(code),
- family.path(code)))
- self.assertEqual(family.from_url(url), code)
+ for code in family.codes:
+ self.current_code = code
+ url = ('%s://%s%s/$1' % (family.protocol(code),
+ family.hostname(code),
+ family.path(code)))
+ # Families can switch off if they want to be detected using URL
+ # this applies for test:test (there is test:wikipedia)
+ if family._ignore_from_url or code in family._ignore_from_url:
+ self.assertIsNone(family.from_url(url))
+ else:
+ self.assertEqual(family.from_url(url), code)
class TestOldFamilyMethod(DeprecationTestCase):
diff --git a/tests/link_tests.py b/tests/link_tests.py
index 0b134c5..8176057 100644
--- a/tests/link_tests.py
+++ b/tests/link_tests.py
@@ -477,7 +477,7 @@
'code': 'en'
},
'test.wp': {
- 'family': 'wikipedia',
+ 'family': 'test',
'code': 'test'
},
}
@@ -840,7 +840,7 @@
config.family = 'wikipedia'
link = Link('wikidata:testwiki:Q6')
link.parse()
- self.assertEqual(link.site, pywikibot.Site('test', 'wikipedia'))
+ self.assertEqual(link.site, pywikibot.Site('test', 'test'))
self.assertEqual(link.title, 'Q6')
self.assertEqual(link.namespace, 0)
@@ -850,7 +850,7 @@
config.family = 'wikipedia'
link = Link('wikidata:testwiki:Talk:Q6')
link.parse()
- self.assertEqual(link.site, pywikibot.Site('test', 'wikipedia'))
+ self.assertEqual(link.site, pywikibot.Site('test', 'test'))
self.assertEqual(link.title, 'Q6')
self.assertEqual(link.namespace, 1)
--
To view, visit https://gerrit.wikimedia.org/r/221448
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4cc8bd7b008f0220d87e6a61dd63e4e50194791d
Gerrit-PatchSet: 16
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits