http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11433
Revision: 11433
Author: drtrigon
Date: 2013-04-22 18:40:18 +0000 (Mon, 22 Apr 2013)
Log Message:
-----------
bug fix; wikidata recognition for unchanged data
improvement; wikidata template page format further refined
Modified Paths:
--------------
trunk/pywikipedia/subster.py
Modified: trunk/pywikipedia/subster.py
===================================================================
--- trunk/pywikipedia/subster.py 2013-04-22 18:10:03 UTC (rev 11432)
+++ trunk/pywikipedia/subster.py 2013-04-22 18:40:18 UTC (rev 11433)
@@ -527,24 +527,35 @@
def data_convertContent(self, substed_content):
"""Converts the substed content to Wikidata format in order to save.
- (1 line of wiki text is converted to 1 claim/statement)
- @param substed_content: New content (with tags).
+ Template page format:
+ <pre>
+ | key1 = value1
+ | key2 = value2
+ ...
+ </pre>
+ (1 line of wiki text is converted to 1 claim/statement, the lines
+ have to be embedded into pre-tags and start with '|')
+
+ @param substed_content: New/Changed content (including tags).
@type substed_content: string
+
+ Returns the extracted and converted data.
"""
# DRTRIGON-130: convert talk page result to wikidata(base)
- # TODO: consider format; every line starting with "|" is data
- # TODO: combine with 'outputContentDiff' in order to update changed only
+ data = u'\n'.join(re.findall('<pre>(.*?)</pre>', substed_content,
+ re.S | re.I))
res = {}
- for line in substed_content.splitlines():
- #data = self.get_var_regex('(.*?)', '(.*?)').findall(line)
- data = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
- #if not data:
- if data == line:
+ for line in data.splitlines():
+ #line = self.get_var_regex('(.*?)', '(.*?)').findall(line)
+ line = self.get_var_regex('.*?', '(.*?)').sub('\g<1>', line)
+ line = line.strip()
+ if (not line) or (line[0] != u'|'):
continue
- data = data.lstrip(u'|')
- key, value = data.split(u'=')
- res[key.strip()] = value.strip()
+ line = line.lstrip(u'|').split(u'=', 1)
+ if len(line) != 2:
+ continue
+ res[line[0].strip()] = line[1].strip()
return res
@@ -562,14 +573,14 @@
datapage = pywikibot.DataPage(self.site, page.title())
links = datapage.searchentities(u'%s:%s' %
(self._bot_config['BotName'], datapage.title().split(u':')[1]))
for element in links:
- propid = self._bot_config['data_PropertyId']
+ propid = int(self._bot_config['data_PropertyId'])
el = element[u'aliases'][0].split(u':')
item = el[2]
if item not in data:
pywikibot.output(u'Value "%s" not found.' % (item,))
data[item] = u'%s: N/A' % self._bot_config['BotName']
if len(el) > 3:
- propid = el[3]
+ propid = int(el[3])
dataoutpage = pywikibot.DataPage(self.site, element['id'])
@@ -579,8 +590,8 @@
claim = [ claim for claim in buf[u'claims'] if (claim['m'][1] == propid) ]
# TODO: does this check (if) work with multiple claims per property?
if (not claim) or (claim[0]['m'][3] != data[item]):
- pywikibot.output(u'%s in %s <--- %s = %s' %\
- (element[u'aliases'][0], dataoutpage.title(asLink=True), item, data[item]))
+ pywikibot.output(u'%s in %s changed to "%s"' %\
+ (element[u'aliases'][0], dataoutpage.title(asLink=True), data[item]))
dataoutpage.editclaim(u'p%s' % propid, data[item],
refs={"p%s" % propid:
[{"snaktype": "value",
_______________________________________________
Pywikipedia-svn mailing list
Pywikipedia-svn@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn