jenkins-bot has submitted this change and it was merged.
Change subject: Bug 69315 - cs timestamp not supported
......................................................................
Bug 69315 - cs timestamp not supported
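cswiki uses timestamps of the form "21. 5. 2014, 17:07 (UTC)", in which
the month is numeric and therefore looks the same as the day.
Once the month is matched, days from 1 to 12 are wiped out as well by
last_match_and_replace(), because they match the month pattern too.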
Changed last_match_and_replace() so that, when the month pattern has N
matches, only the first N-2 matches and the last match are replaced,
leaving match N-1 (the one before the last) as the day candidate.
Modified also:
- fixed a bug in the monthR regex: if a month name contains a dot, it
needs to be escaped. This happened e.g. on wikipedia:no.
Also raise the KeyError exception instead of swallowing it, since
swallowing it was masking this bug.
- extended the regex for day to include an optional trailing dot (not
mandatory, but it improves the robustness of the search).
- renamed self.timeznR -> self.ptimeznR and self.yearR -> self.pyearR
for consistency with the naming convention used by the other patterns.
- fixed a pep8 error at line 859
Change-Id: Ifee1041cf9762f419c48fc6cb0faa56b84d0bee4
---
M pywikibot/textlib.py
M tests/archivebot_tests.py
M tests/timestripper_tests.py
3 files changed, 129 insertions(+), 15 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index d0ac855..0b56f68 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -856,7 +856,7 @@
else:
sep = config.line_separator
# Some people don't like the categories sorted
- #catLinks.sort()
+ # catLinks.sort()
return sep.join(catLinks) + config.line_separator
@@ -1181,6 +1181,8 @@
for n, (_long, _short) in enumerate(self.site.months_names, start=1):
self.origNames2monthNum[_long] = n
self.origNames2monthNum[_short] = n
+ # in some cases month in ~~~~ might end without dot even if
+ # site.months_names do not.
if _short.endswith('.'):
self.origNames2monthNum[_short[:-1]] = n
@@ -1189,20 +1191,22 @@
timeR = r'(?P<time>(?P<hour>[0-2]\d)[:\.h](?P<minute>[0-5]\d))'
timeznR = r'\((?P<tzinfo>[A-Z]+)\)'
yearR = r'(?P<year>(19|20)\d\d)'
- monthR = r'(?P<month>(%s))' % (u'|'.join(self.origNames2monthNum))
- dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))'
+ # if months name contain a dot, it needs to be escaped.
+ escaped_months = [re.escape(_) for _ in self.origNames2monthNum]
+ monthR = r'(?P<month>(%s))' % u'|'.join(escaped_months)
+ dayR = r'(?P<day>(3[01]|[12]\d|0?[1-9]))\.?'
self.ptimeR = re.compile(timeR)
- self.timeznR = re.compile(timeznR)
- self.yearR = re.compile(yearR)
+ self.ptimeznR = re.compile(timeznR)
+ self.pyearR = re.compile(yearR)
self.pmonthR = re.compile(monthR, re.U)
self.pdayR = re.compile(dayR)
# order is important to avoid mismatch when searching
self.patterns = [
self.ptimeR,
- self.timeznR,
- self.yearR,
+ self.ptimeznR,
+ self.pyearR,
self.pmonthR,
self.pdayR,
]
@@ -1218,12 +1222,21 @@
Take the rightmost match, to prevent spurious earlier matches, and
replace with marker
"""
m = None
+ cnt = 0
for m in pat.finditer(txt):
- pass
+ cnt += 1
if m:
marker = self.findmarker(txt)
- txt = pat.sub(marker, txt)
+ # month and day format might be identical (e.g. see bug 69315),
+ # avoid to wipe out day, after month is matched.
+ # replace all matches but the one before last, which is the day
candidate.
+ if pat == self.pmonthR:
+ txt = pat.sub(marker, txt, cnt - 2)
+ # matched month needs to be wiped out (last match of txt)
+ txt = re.sub(r'(.*)%s' % m.group(), r'\1%s' % marker, txt)
+ else:
+ txt = pat.sub(marker, txt)
return (txt, m.groupdict())
else:
return (txt, None)
@@ -1241,7 +1254,6 @@
line, matchDict = self.last_match_and_replace(line, pat)
if matchDict:
dateDict.update(matchDict)
-
# all fields matched -> date valid
if all(g in dateDict for g in self.groups):
# remove 'time' key, now splitted in hour/minute and not needed by
datetime
@@ -1251,14 +1263,19 @@
try:
dateDict['month'] = self.origNames2monthNum[dateDict['month']]
except KeyError:
- pywikibot.output(u'incorrect month name in page')
+ pywikibot.output(u'incorrect month name "%s" in page in site
%s'
+ % (dateDict['month'], self.site))
+ raise KeyError
# convert to integers
for k, v in dateDict.items():
+ if k == 'tzinfo':
+ continue
try:
dateDict[k] = int(v)
except ValueError:
- pass
+ raise ValueError('Value: %s could not be converted for
key: %s.'
+ % (v, k))
# find timezone
dateDict['tzinfo'] =
tzoneFixedOffset(self.site.siteinfo['timeoffset'],
diff --git a/tests/archivebot_tests.py b/tests/archivebot_tests.py
index fa1d915..8539ef3 100644
--- a/tests/archivebot_tests.py
+++ b/tests/archivebot_tests.py
@@ -73,9 +73,14 @@
for code in THREADS:
test_name = "test_wikipedia_" + code
- if code in ['ar', 'ckb', 'en', 'fa', 'frr', 'no', 'pdc', 'pt',
'th',
- 'ug']:
+ if code in ['ar', 'ckb', 'fa', 'pdc', 'th']:
# expected failures - should be fixed
+ # 'ar', 'ckb', 'fa': no digits in date, regex does not match
+ # 'pdc': changed month name setting in wiki over time (?)
+ # in old posts in talk page, February is "Feb.", site
message gives
+ # <message name="feb" xml:space="preserve">Han.</message>.
+ # for new entries it should work
+ # 'th': year is 2552 while regex assumes 19..|20.., might be
fixed
dct[test_name] = unittest.expectedFailure(test_method(code))
else:
dct[test_name] = test_method(code)
diff --git a/tests/timestripper_tests.py b/tests/timestripper_tests.py
index 79a1878..1b3f171 100644
--- a/tests/timestripper_tests.py
+++ b/tests/timestripper_tests.py
@@ -36,7 +36,7 @@
txtWithMatch = u'this string has one 1998, 1999 and 3000 in it'
txtWithNoMatch = u'this string has no match'
- pat = self.ts.yearR
+ pat = self.ts.pyearR
self.assertEqual(self.ts.last_match_and_replace(txtWithMatch, pat),
(u'this string has one @@, @@ and 3000 in it',
@@ -62,6 +62,98 @@
self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+class TestEnglishTimeStripper(PywikibotTestCase):
+ """Test cases for Link objects"""
+
+ def setUp(self):
+ site = pywikibot.Site('en', 'wikipedia')
+ self.ts = TimeStripper(site)
+ super(TestEnglishTimeStripper, self).setUp()
+
+ def test_timestripper(self):
+ """Test that correct date is matched"""
+
+ txtMatch = u'3 February 2010 19:48 (UTC) 7 February 2010 19:48 (UTC)'
+ txtNoMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)'
+
+ tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+ self.ts.site.siteinfo['timezone'])
+
+ res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+ self.assertEqual(self.ts.timestripper(txtMatch), res)
+ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestCzechTimeStripper(PywikibotTestCase):
+ """Test cases for Link objects"""
+
+ def setUp(self):
+ site = pywikibot.Site('cs', 'wikipedia')
+ self.ts = TimeStripper(site)
+ super(TestCzechTimeStripper, self).setUp()
+
+ def test_timestripper(self):
+ """Test that correct date is matched"""
+
+ txtMatch = u'3. 2. 2010, 19:48 (UTC) 7. 2. 2010 19:48 (UTC)'
+ txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+ tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+ self.ts.site.siteinfo['timezone'])
+
+ res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+ self.assertEqual(self.ts.timestripper(txtMatch), res)
+ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestPortugueseTimeStripper(PywikibotTestCase):
+ """Test cases for Link objects"""
+
+ def setUp(self):
+ site = pywikibot.Site('pt', 'wikipedia')
+ self.ts = TimeStripper(site)
+ super(TestPortugueseTimeStripper, self).setUp()
+
+ def test_timestripper(self):
+ """Test that correct date is matched"""
+
+ txtMatch = u'19h48min de 3 de fevereiro de 2010 (UTC) 19h48min de 7
de fevereiro de 2010 (UTC)'
+ txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+ tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+ self.ts.site.siteinfo['timezone'])
+
+ res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+ self.assertEqual(self.ts.timestripper(txtMatch), res)
+ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
+class TestNorwegianTimeStripper(PywikibotTestCase):
+ """Test cases for Link objects"""
+
+ def setUp(self):
+ site = pywikibot.Site('no', 'wikipedia')
+ self.ts = TimeStripper(site)
+ super(TestNorwegianTimeStripper, self).setUp()
+
+ def test_timestripper(self):
+ """Test that correct date is matched"""
+
+ txtMatch = u'3. feb 2010 kl. 19:48 (CET) 7. feb 2010 kl. 19:48 (UTC)'
+ txtNoMatch = u'3 March 2010 19:48 (UTC) 7 March 2010 19:48 (UTC)'
+
+ tzone = tzoneFixedOffset(self.ts.site.siteinfo['timeoffset'],
+ self.ts.site.siteinfo['timezone'])
+
+ res = datetime.datetime(2010, 2, 7, 19, 48, tzinfo=tzone)
+
+ self.assertEqual(self.ts.timestripper(txtMatch), res)
+ self.assertEqual(self.ts.timestripper(txtNoMatch), None)
+
+
if __name__ == '__main__':
try:
unittest.main()
--
To view, visit https://gerrit.wikimedia.org/r/153338
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ifee1041cf9762f419c48fc6cb0faa56b84d0bee4
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <[email protected]>
Gerrit-Reviewer: John Vandenberg <[email protected]>
Gerrit-Reviewer: Ladsgroup <[email protected]>
Gerrit-Reviewer: Merlijn van Deen <[email protected]>
Gerrit-Reviewer: Mpaa <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
Pywikibot-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikibot-commits