jenkins-bot has submitted this change and it was merged.

Change subject: [FEAT] Fully flexible Site(url)
......................................................................


[FEAT] Fully flexible Site(url)

It completely redesigns how it looks up sites using a URL. Instead of creating
a regex which parses any possible combination of sites, it first looks through
the languages in a family to determine which codes use the same domain. After
that it checks whether the site uses either the article path or the index.php path.

This uses a Site instance instead of the Family properties to make it fully
flexible and remove the necessity of nicepath. This also removes all regex
related methods in Family.

It fixes a typo in `test_each_family` as it didn't add a separator between the
index.php file and the placeholder. It also removes the test family from the
skipped entries, as it'll skip test sites anyway (as they aren't in `codes`), so
that it can now match the test family instead.

Bug: T85658
Change-Id: I4cc8bd7b008f0220d87e6a61dd63e4e50194791d
---
M pywikibot/__init__.py
M pywikibot/families/test_family.py
M pywikibot/family.py
M tests/family_tests.py
M tests/link_tests.py
5 files changed, 135 insertions(+), 176 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index b1d2cc3..64b2f07 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -559,31 +559,33 @@
     _logger = "wiki"
 
     if url:
-        if url in _url_cache:
-            cached = _url_cache[url]
-            if cached:
-                code = cached[0]
-                fam = cached[1]
-            else:
-                raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
-        else:
+        if url not in _url_cache:
+            matched_sites = []
             # Iterate through all families and look, which does apply to
             # the given URL
             for fam in config.family_files:
-                try:
-                    family = pywikibot.family.Family.load(fam)
-                    code = family.from_url(url)
-                    if code:
-                        _url_cache[url] = (code, fam)
-                        break
-                except Exception as e:
-                    pywikibot.warning('Error in Family(%s).from_url: %s'
-                                      % (fam, e))
+                family = pywikibot.family.Family.load(fam)
+                code = family.from_url(url)
+                if code is not None:
+                    matched_sites += [(code, fam)]
+
+            if matched_sites:
+                if len(matched_sites) > 1:
+                    pywikibot.warning(
+                        'Found multiple matches for URL "{0}": {1} (use first)'
+                        .format(url, ', '.join(str(s) for s in matched_sites)))
+                _url_cache[url] = matched_sites[0]
             else:
-                _url_cache[url] = None
                 # TODO: As soon as AutoFamily is ready, try and use an
                 #       AutoFamily
-                raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
+                _url_cache[url] = None
+
+        cached = _url_cache[url]
+        if cached:
+            code = cached[0]
+            fam = cached[1]
+        else:
+            raise SiteDefinitionError("Unknown URL '{0}'.".format(url))
     else:
         # Fallback to config defaults
         code = code or config.mylang
diff --git a/pywikibot/families/test_family.py 
b/pywikibot/families/test_family.py
index 87bdaf1..fa26d62 100644
--- a/pywikibot/families/test_family.py
+++ b/pywikibot/families/test_family.py
@@ -14,7 +14,3 @@
 
     name = 'test'
     domain = 'test.wikipedia.org'
-
-    def from_url(self, url):
-        """Return None to indicate no code of this family is accepted."""
-        return None  # Don't accept this, but 'test' of 'wikipedia'
diff --git a/pywikibot/family.py b/pywikibot/family.py
index 66b676d..e933d6f 100644
--- a/pywikibot/family.py
+++ b/pywikibot/family.py
@@ -1060,13 +1060,19 @@
         # Override this ONLY if the wiki family requires a path prefix
         return ''
 
-    def base_url(self, code, uri):
+    def _hostname(self, code):
+        """Return the protocol and hostname."""
         protocol = self.protocol(code)
         if protocol == 'https':
             host = self.ssl_hostname(code)
-            uri = self.ssl_pathprefix(code) + uri
         else:
             host = self.hostname(code)
+        return protocol, host
+
+    def base_url(self, code, uri):
+        protocol, host = self._hostname(code)
+        if protocol == 'https':
+            uri = self.ssl_pathprefix(code) + uri
         return urlparse.urljoin('{0}://{1}'.format(protocol, host), uri)
 
     def path(self, code):
@@ -1078,38 +1084,9 @@
     def apipath(self, code):
         return '%s/api.php' % self.scriptpath(code)
 
-    # TODO: @deprecated('APISite.article_path')
-    # As soon as from_url does not need nicepath anymore
+    @deprecated('APISite.article_path')
     def nicepath(self, code):
         return '/wiki/'
-
-    def _get_path_regex(self, code):
-        """
-        Return a regex matching a site URL path.
-
-        @return: regex string
-        @rtype: unicode
-        """
-        # The trailing slash after path(code) is optional.
-        return ('(?:%s?|%s)' %
-                (re.escape(self.path(code) + '/'),
-                 re.escape(self.nicepath(code))))
-
-    def _get_url_regex(self, code):
-        """
-        Return a regex matching a site URL.
-
-        Regex match group 1 is the domain.
-
-        Does not make use of ssl_hostname or ssl_pathprefix.
-
-        @return: regex string
-        @rtype: unicode
-        """
-        return (r'(?:\/\/|%s\:\/\/)(%s)%s' %
-                (self.protocol(code),
-                 re.escape(self.hostname(code)),
-                 self._get_path_regex(code)))
 
     def rcstream_host(self, code):
         raise NotImplementedError("This family does not support RCStream")
@@ -1122,119 +1099,82 @@
     def nice_get_address(self, code, title):
         return '%s%s' % (self.nicepath(code), title)
 
-    def _get_regex_all(self):
-        """
-        Return a regex matching any site.
-
-        It is using Family methods with code set to 'None' initially.
-        That will raise KeyError if the Family methods use the code to
-        lookup the correct value in a dictionary such as C{langs}.
-        On KeyError, it retries it with each key from C{langs}.
-
-        @return: regex string
-        @rtype: unicode
-        """
-        if hasattr(self, '_regex_all'):
-            return self._regex_all
-
-        try:
-            self._regex_all = self._get_url_regex(None)
-            return self._regex_all
-        except KeyError:
-            # Probably automatically generated family
-            pass
-
-        # If there is only one code, use it.
-        if len(self.langs) == 1:
-            code = next(iter(self.langs.keys()))
-            self._regex_all = self._get_url_regex(code)
-            return self._regex_all
-
-        try:
-            protocol = self.protocol(None) + '\:\/\/'
-        except KeyError:
-            protocol = None
-
-        try:
-            hostname = re.escape(self.hostname(None))
-        except KeyError:
-            hostname = None
-
-        try:
-            path = self._get_path_regex(None)
-        except KeyError:
-            path = None
-
-        # If two or more of the three above varies, the regex cant be optimised
-        none_count = [protocol, hostname, path].count(None)
-
-        if none_count > 1:
-            self._regex_all = ('(?:%s)'
-                               % '|'.join(self._get_url_regex(code)
-                                          for code in self.langs.keys()))
-            return self._regex_all
-
-        if not protocol:
-            protocols = set(self.protocol(code) + '\:\/\/'
-                            for code in self.langs.keys())
-            protocol = '|'.join(protocols)
-
-        # Allow protocol neutral '//'
-        protocol = '(?:\/\/|%s)' % protocol
-
-        if not hostname:
-            hostnames = set(re.escape(self.hostname(code))
-                            for code in self.langs.keys())
-            hostname = '|'.join(hostnames)
-
-        # capture hostname
-        hostname = '(' + hostname + ')'
-
-        if not path:
-            regexes = set(self._get_path_regex(code)
-                          for code in self.langs.keys())
-            path = '(?:%s)' % '|'.join(regexes)
-
-        self._regex_all = protocol + hostname + path
-        return self._regex_all
+    # List of codes which aren't returned by from_url; True returns None always
+    _ignore_from_url = []
 
     def from_url(self, url):
         """
         Return whether this family matches the given url.
 
-        It must match URLs generated via C{self.langs} and
-        L{Family.nice_get_address} or L{Family.path}. If the protocol doesn't
-        match but is present in the interwikimap it'll log this.
+        It is first checking if a domain of this family is in the domain of
+        the URL. If that is the case it's checking all codes and verifies that
+        a path generated via L{APISite.article_path} and L{Family.path} matches
+        the path of the URL together with the hostname for that code.
 
-        It ignores $1 in the url, and anything that follows it.
+        It is using L{Family.domains} to first check if a domain applies and
+        then iterates over L{Family.codes} to actually determine which code
+        applies.
 
+        @param url: the URL which may contain a C{$1}. If it's missing it is
+            assumed to be at the end and if it's present nothing is allowed
+            after it.
+        @type url: str
         @return: The language code of the url. None if that url is not from
             this family.
         @rtype: str or None
-        @raises RuntimeError: Mismatch between Family langs dictionary and
-            URL regex.
+        @raises RuntimeError: When there are multiple languages in this family
+            which would work with the given URL.
+        @raises ValueError: When text is present after $1.
         """
-        if '$1' in url:
-            url = url[:url.find('$1')]
-
-        url_match = re.match(self._get_regex_all(), url)
-        if not url_match:
+        if self._ignore_from_url is True:
             return None
+        else:
+            ignored = self._ignore_from_url
 
-        for code, domain in self.langs.items():
-            if domain is None:
-                warn('Family(%s): langs missing domain names' % self.name,
-                     FamilyMaintenanceWarning)
-            elif domain == url_match.group(1):
-                return code
+        parsed = urlparse.urlparse(url)
+        if not re.match('^(https?)?$', parsed.scheme):
+            return None
+        path = parsed.path
+        if parsed.query:
+            path += '?' + parsed.query
 
-        # if domain was None, this will return the only possible code.
-        if len(self.langs) == 1:
-            return next(iter(self.langs))
+        # Discard $1 and everything after it
+        path, _, suffix = path.partition('$1')
+        if suffix:
+            raise ValueError('Text after the $1 placeholder is not supported '
+                             '(T111513).')
 
-        raise RuntimeError(
-            'Family(%s): matched regex has not matched a domain in langs'
-            % self.name)
+        matched_sites = []
+        for domain in self.domains:
+            if domain in parsed.netloc:
+                break
+        else:
+            domain = False
+        if domain is not False:
+            for code in self.codes:
+                if code in ignored:
+                    continue
+                if self._hostname(code)[1] == parsed.netloc:
+                    # Use the code and family instead of the url
+                    # This is only creating a Site instance if domain matches
+                    site = pywikibot.Site(code, self.name)
+                    pywikibot.log('Found candidate {0}'.format(site))
+                    site_paths = [site.path()] * 3
+                    site_paths[1] += '/'
+                    site_paths[2] += '?title='
+                    site_paths += [site.article_path]
+
+                    if path in site_paths:
+                        matched_sites += [site]
+
+        if len(matched_sites) == 1:
+            return matched_sites[0].code
+        elif not matched_sites:
+            return None
+        else:
+            raise RuntimeError(
+                'Found multiple matches for URL "{0}": {1}'
+                .format(url, ', '.join(str(s) for s in matched_sites)))
 
     def maximum_GET_length(self, code):
         return config.maximum_GET_length
diff --git a/tests/family_tests.py b/tests/family_tests.py
index 3f0b008..bb5b11f 100644
--- a/tests/family_tests.py
+++ b/tests/family_tests.py
@@ -19,7 +19,9 @@
     unittest,
     TestCase,
     DeprecationTestCase,
+    PatchingTestCase,
 )
+from tests.utils import DrySite
 
 if not PY2:
     basestring = (str, )
@@ -143,23 +145,34 @@
                           {'a': 'b', 'c': None})
 
 
-class TestFamilyUrlRegex(TestCase):
+class TestFamilyUrlRegex(PatchingTestCase):
 
     """Test family URL regex."""
 
     net = False
 
-    def test_get_regex_wikipedia_precise(self):
-        """Test the family regex is optimal."""
-        f = Family.load('wikipedia')
-        regex = f._get_regex_all()
+    @PatchingTestCase.patched(pywikibot, 'Site')
+    def Site(self, code, fam, *args, **kwargs):
+        """Own DrySite creator."""
+        self.assertEqual(args, tuple())
+        self.assertEqual(kwargs, {})
+        self.assertEqual(code, self.current_code)
+        self.assertEqual(fam, self.current_family)
+        site = DrySite(code, fam, None, None)
+        site._siteinfo._cache['general'] = ({'articlepath': self.article_path},
+                                            True)
+        return site
 
-        self.assertTrue(regex.startswith('(?:\/\/|https\:\/\/)('))
-        self.assertIn('vo\.wikipedia\.org', regex)
-        self.assertTrue(regex.endswith(')(?:\/w\/index\.php\/?|\/wiki\/)'))
+    def setUp(self):
+        """Setup default article path."""
+        super(TestFamilyUrlRegex, self).setUp()
+        self.article_path = '/wiki/$1'
 
     def test_from_url_wikipedia_extra(self):
         """Test various URLs against wikipedia regex."""
+        self.current_code = 'vo'
+        self.current_family = 'wikipedia'
+
         f = Family.load('wikipedia')
 
         prefix = 'https://vo.wikipedia.org'
@@ -171,13 +184,18 @@
 
         self.assertEqual(f.from_url(prefix + '/wiki/$1'), 'vo')
         self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
-        self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'), 'vo')
         self.assertEqual(f.from_url(prefix + '/w/index.php/$1'), 'vo')
         self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1'), 'vo')
-        self.assertEqual(f.from_url('//vo.wikipedia.org/wiki/$1/foo'), 'vo')
+
+        # Text after $1 is not allowed
+        self.assertRaises(ValueError, f.from_url,
+                          '//vo.wikipedia.org/wiki/$1/foo')
+
+        # the IWM may contain the wrong protocol, but it's only used to
+        # determine a site so using HTTP or HTTPS is not an issue
+        self.assertEqual(f.from_url('http://vo.wikipedia.org/wiki/$1'), 'vo')
 
         # wrong protocol
-        self.assertIsNone(f.from_url('http://vo.wikipedia.org/wiki/$1'))
         self.assertIsNone(f.from_url('ftp://vo.wikipedia.org/wiki/$1'))
         # wrong code
         self.assertIsNone(f.from_url('https://foobar.wikipedia.org/wiki/$1'))
@@ -191,16 +209,19 @@
     def test_each_family(self):
         """Test each family builds a working regex."""
         for family in pywikibot.config.family_files:
+            self.current_family = family
             family = Family.load(family)
-            # Test family does not respond to from_url due to overlap
-            # with Wikipedia family.
-            if family.name == 'test':
-                continue
-            for code in family.langs:
-                url = ('%s://%s%s$1' % (family.protocol(code),
-                                        family.hostname(code),
-                                        family.path(code)))
-                self.assertEqual(family.from_url(url), code)
+            for code in family.codes:
+                self.current_code = code
+                url = ('%s://%s%s/$1' % (family.protocol(code),
+                                         family.hostname(code),
+                                         family.path(code)))
+                # Families can switch off if they want to be detected using URL
+                # this applies for test:test (there is test:wikipedia)
+                if family._ignore_from_url or code in family._ignore_from_url:
+                    self.assertIsNone(family.from_url(url))
+                else:
+                    self.assertEqual(family.from_url(url), code)
 
 
 class TestOldFamilyMethod(DeprecationTestCase):
diff --git a/tests/link_tests.py b/tests/link_tests.py
index 0b134c5..8176057 100644
--- a/tests/link_tests.py
+++ b/tests/link_tests.py
@@ -477,7 +477,7 @@
             'code': 'en'
         },
         'test.wp': {
-            'family': 'wikipedia',
+            'family': 'test',
             'code': 'test'
         },
     }
@@ -840,7 +840,7 @@
         config.family = 'wikipedia'
         link = Link('wikidata:testwiki:Q6')
         link.parse()
-        self.assertEqual(link.site, pywikibot.Site('test', 'wikipedia'))
+        self.assertEqual(link.site, pywikibot.Site('test', 'test'))
         self.assertEqual(link.title, 'Q6')
         self.assertEqual(link.namespace, 0)
 
@@ -850,7 +850,7 @@
         config.family = 'wikipedia'
         link = Link('wikidata:testwiki:Talk:Q6')
         link.parse()
-        self.assertEqual(link.site, pywikibot.Site('test', 'wikipedia'))
+        self.assertEqual(link.site, pywikibot.Site('test', 'test'))
         self.assertEqual(link.title, 'Q6')
         self.assertEqual(link.namespace, 1)
 

-- 
To view, visit https://gerrit.wikimedia.org/r/221448
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I4cc8bd7b008f0220d87e6a61dd63e4e50194791d
Gerrit-PatchSet: 16
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: XZise <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to