jenkins-bot has submitted this change and it was merged.

Change subject: Bug 69315 - cs timestamp not supported
......................................................................


Bug 69315 - cs timestamp not supported

cswiki uses timestamps of the form "21. 5. 2014, 17:07 (UTC)".

Once the month is matched, day candidates from 1-12 are wiped out by
last_match_and_replace(), because they match the month pattern as well.
Changed last_match_and_replace() to replace only the first N-2 matches
and the last match, leaving match N-1 as the day candidate.

Also modified:
- fixed a bug in the monthR regex: if a month name contains a dot, the
dot needs to be escaped. This happened e.g. on wikipedia:no.
Also raise the KeyError exception instead of swallowing it, since
swallowing it was masking this bug.

- extended the day regex to include an optional trailing dot (not
mandatory, but improves the robustness of the search).

- renamed self.timeznR -> self.ptimeznR and self.yearR -> self.pyearR
for consistency with the naming convention.

- fixed a pep8 error at line 859

Change-Id: Ifee1041cf9762f419c48fc6cb0faa56b84d0bee4
---
M pywikibot/textlib.py
M tests/archivebot_tests.py
M tests/timestripper_tests.py
3 files changed, 129 insertions(+), 15 deletions(-)

Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index d0ac855..0b56f68 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -856,7 +856,7 @@
     else:
         sep = config.line_separator
     # Some people don't like the categories sorted
-    #catLinks.sort()
+    # catLinks.sort()
     return sep.join(catLinks) + config.line_separator
 
 
@@ -1181,6 +1181,8 @@
         for n, (_long, _short) in enumerate(self.site.months_names, start=1):
             self.origNames2monthNum[_long] = n
             self.origNames2monthNum[_short] = n
+            # in some cases month in ~~~~ might end without dot even if
+            # site.months_names do not.
             if _short.endswith('.'):
                 self.origNames2monthNum[_short[:-1]] = n
 
@@ -1189,20 +1191,22 @@
         timeR = r'(?P<time>(?P<hour>[0-2]\d)[:\.h](?P<minute>[0-5]\d))'
         timeznR = r'\((?P<tzinfo>[A-Z]+)\)'
         yearR = r'(?P<year>(19|20)\d\d)'
-        monthR = r'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum))
-        dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))'
+        # if months name contain a dot, it needs to be escaped.
+        escaped_months = [re.escape(_) for _ in self.origNames2monthNum]
+        monthR = r'(?P<month>(%s))' % u'|'.join(escaped_months)
+        dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))\.?'
 
         self.ptimeR = re.compile(timeR)
-        self.timeznR = re.compile(timeznR)
-        self.yearR = re.compile(yearR)
+        self.ptimeznR = re.compile(timeznR)
+        self.pyearR = re.compile(yearR)
         self.pmonthR = re.compile(monthR, re.U)
         self.pdayR = re.compile(dayR)
 
         # order is important to avoid mismatch when searching
         self.patterns = [
             self.ptimeR,
-            self.timeznR,
-            self.yearR,
+            self.ptimeznR,
+            self.pyearR,
             self.pmonthR,
             self.pdayR,
         ]
@@ -1218,12 +1222,21 @@
         Take the rightmost match, to prevent spurious earlier matches, and 
replace with marker
         """
         m = None
+        cnt = 0
         for m in pat.finditer(txt):
-            pass
+            cnt += 1
 
         if m:
             marker = self.findmarker(txt)
-            txt = pat.sub(marker, txt)
+            # month and day format might be identical (e.g. see bug 69315),
+            # avoid to wipe out day, after month is matched.
+            # replace all matches but the one before last, which is the day 
candidate.
+            if pat == self.pmonthR:
+                txt = pat.sub(marker, txt, cnt - 2)
+                # matched month needs to be wiped out (last match of txt)
+                txt = re.sub(r'(.*)%s' % m.group(), r'\1%s' % marker, txt)
+            else:
+                txt = pat.sub(marker, txt)
             return (txt, m.groupdict())
         else:
             return (txt, None)
@@ -1241,7 +1254,6 @@
             line, matchDict = self.last_match_and_replace(line, pat)
             if matchDict:
                 dateDict.update(matchDict)
-
         # all fields matched -> date valid
         if all(g in dateDict for g in self.groups):
             # remove 'time' key, now splitted in hour/minute and not needed by 
datetime
@@ -1251,14 +1263,19 @@
             try:
                 dateDict['month'] = self.origNames2monthNum[dateDict['month']]
             except KeyError:
-                pywikibot.output(u'incorrect month name in page')
+                pywikibot.output(u'incorrect month name "%s" in page in site 
%s'
+                                 % (dateDict['month'], self.site))
+                raise KeyError
 
             # convert to integers
             for k, v in dateDict.items():
+                if k == 'tzinfo':
+                    continue
                 try:
                     dateDict[k] = int(v)
                 except ValueError:
-                    pass
+                    raise ValueError('Value: %s could not be converted for 
key: %s.'
+                                     % (v, k))
 
             # find timezone
             dateDict['tzinfo'] = 
tzoneFixedOffset(self.site.siteinfo['timeoffset'],
diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py
index fa1d915..8539ef3 100644
--- a/tests/archivebot_tests.py
+++ b/tests/archivebot_tests.py
@@ -73,9 +73,14 @@
         for code in THREADS:
             test_name = "test_wikipedia_" + code
 
-            if code in ['ar', 'ckb', 'en', 'fa', 'frr', 'no', 'pdc', 'pt', 
'th',
-                        'ug']:
+            if code in ['ar', 'ckb', 'fa', 'pdc', 'th']:
                 # expected failures - should be fixed
+                # 'ar', 'ckb', 'fa': no digits in date, regex does not match
+                # 'pdc': changed month name setting in wiki over time (?)
+                #   in old posts in talk page, February is "Feb.", site 
message gives
+                #   <message name="feb" xml:space="preserve">Han.</message>.
+                #   for new entries it should work
+                # 'th': year is 2552 while regex assumes 19..|20.., might be 
fixed
                 dct[test_name] = unittest.expectedFailure(test_method(code))
             else:
                 dct[test_name] = test_method(code)
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index 79a1878..1b3f171 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -36,7 +36,7 @@
 
         txtWithMatch = u'this string has one 1998, 1999 and 3000 in it'
         txtWithNoMatch = u'this string has no match'
-        pat = self.ts.yearR
+        pat = self.ts.pyearR
 
         self.assertEqual(self.ts.last_match_and_replace(txtWithMatch, pat),
                          (u'this string has one @@, @@ and 3000 in it',
@@ -62,6 +62,98 @@
         self.assertEqual(self.ts.timestripper(txtNoMatch), None)
 
 
+class TestEnglishTimeStripper(PywikibotTestCase):
+    """Test cases for Link objects"""
+
+    def setUp(self):
+        site = pywikibot.Site('en', 'wikipedia')
+        self.ts = TimeStripper(site)
+        super(TestEnglishTimeStripper, self).setUp()
+
+    def test_timestripper(self):
+        """Test that correct date is matched"""
+
+        txtMatch = u'3 February 2010 19:48 (UTC) 7 February 2010 19:48 (UTC)'
+        txtNoMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)'
+
+        tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+                                 self.ts.site.siteinfo['timezone'])
+
+        res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+        self.assertEqual(self.ts.timestripper(txtMatch), res)
+        self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestCzechTimeStripper(PywikibotTestCase):
+    """Test cases for Link objects"""
+
+    def setUp(self):
+        site = pywikibot.Site('cs', 'wikipedia')
+        self.ts = TimeStripper(site)
+        super(TestCzechTimeStripper, self).setUp()
+
+    def test_timestripper(self):
+        """Test that correct date is matched"""
+
+        txtMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)'
+        txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+        tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+                                 self.ts.site.siteinfo['timezone'])
+
+        res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+        self.assertEqual(self.ts.timestripper(txtMatch), res)
+        self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestPortugueseTimeStripper(PywikibotTestCase):
+    """Test cases for Link objects"""
+
+    def setUp(self):
+        site = pywikibot.Site('pt', 'wikipedia')
+        self.ts = TimeStripper(site)
+        super(TestPortugueseTimeStripper, self).setUp()
+
+    def test_timestripper(self):
+        """Test that correct date is matched"""
+
+        txtMatch = u'19h48min de 3 de fevereiro de 2010‎ (UTC) 19h48min de 7 
de fevereiro de 2010‎ (UTC)'
+        txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+        tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+                                 self.ts.site.siteinfo['timezone'])
+
+        res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+        self.assertEqual(self.ts.timestripper(txtMatch), res)
+        self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestNorwegianTimeStripper(PywikibotTestCase):
+    """Test cases for Link objects"""
+
+    def setUp(self):
+        site = pywikibot.Site('no', 'wikipedia')
+        self.ts = TimeStripper(site)
+        super(TestNorwegianTimeStripper, self).setUp()
+
+    def test_timestripper(self):
+        """Test that correct date is matched"""
+
+        txtMatch = u'3. feb 2010 kl. 19:48 (CET) 7. feb 2010 kl. 19:48 (UTC)'
+        txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+        tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+                                 self.ts.site.siteinfo['timezone'])
+
+        res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+        self.assertEqual(self.ts.timestripper(txtMatch), res)
+        self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
 if __name__ == '__main__':
     try:
         unittest.main()

-- 
To view, visit https://gerrit.wikimedia.org/r/153338
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifee1041cf9762f419c48fc6cb0faa56b84d0bee4
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Mpaa <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits

Reply via email to