Dachary has uploaded a new change for review. https://gerrit.wikimedia.org/r/316059
Change subject: license: import licenses from wikipedia ...................................................................... license: import licenses from wikipedia Change-Id: Ia9f52d1a1cf60de4a2d098cfecf6e056b3ca24ba Signed-off-by: Loic Dachary <l...@dachary.org> --- M FLOSSbot/bot.py A FLOSSbot/license.py A tests/test_license.py 3 files changed, 429 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/bots/FLOSSbot refs/changes/59/316059/1 diff --git a/FLOSSbot/bot.py b/FLOSSbot/bot.py index 8642759..6962b54 100644 --- a/FLOSSbot/bot.py +++ b/FLOSSbot/bot.py @@ -22,7 +22,7 @@ import pywikibot from pywikibot import pagegenerators as pg -from FLOSSbot import fsd, qa, repository +from FLOSSbot import fsd, license, qa, repository from FLOSSbot.plugin import Plugin logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s') @@ -32,6 +32,7 @@ repository.Repository, qa.QA, fsd.FSD, + license.License, ] name2plugin = dict([(p.__name__, p) for p in plugins]) diff --git a/FLOSSbot/license.py b/FLOSSbot/license.py new file mode 100644 index 0000000..7f8fd09 --- /dev/null +++ b/FLOSSbot/license.py @@ -0,0 +1,291 @@ +# +# Copyright (C) 2016 Loic Dachary <l...@dachary.org> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +import argparse +import logging +import re + +import pywikibot +from pywikibot import pagegenerators as pg + +from FLOSSbot import plugin + +log = logging.getLogger(__name__) + + +class License(plugin.Plugin): + + def __init__(self, *args): + super(License, self).__init__(*args) + self.license2item = None + self.licenses = None + + @staticmethod + def get_parser(): + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument( + '--license', + action='append', + default=[], + help='only consider this license (can be repeated)') + return parser + + @staticmethod + def filter_names(): + return ['license-verify', 'no-license'] + + def get_query(self, filter): + format_args = { + 'license': self.P_license, + 'subclass_of': self.P_subclass_of, + 'instance_of': self.P_instance_of, + 'open_source': self.Q_open_source_license.getID(), + 'free_software': self.Q_free_software_license.getID(), + 'retrieved': self.P_retrieved, + 'delay': self.args.verification_delay, + } + if filter == 'license-verify': + query = """ + SELECT DISTINCT ?item WHERE {{ + {{ + ?item p:{license} ?license . + ?license ps:{license}/wdt:{instance_of}?/wdt:{subclass_of}* + wd:{open_source}. + }} Union {{ + ?item p:{license} ?license . + ?license ps:{license}/wdt:{instance_of}?/wdt:{subclass_of}* + wd:{free_software}. + }} + OPTIONAL {{ + ?license prov:wasDerivedFrom/ + <http://www.wikidata.org/prop/reference/{retrieved}> + ?retrieved + }} + FILTER (!BOUND(?retrieved) || + ?retrieved < (now() - "P{delay}D"^^xsd:duration)) + }} ORDER BY ?item + """.format(**format_args) + elif filter == 'no-license': + format_args.update({ + 'foss': self.Q_free_and_open_source_software.getID(), + 'free_software': self.Q_free_software.getID(), + 'open_source_software': self.Q_open_source_software.getID(), + 'public_domain': self.Q_public_domain.getID(), + 'software': self.Q_software.getID(), + }) + query = """ + SELECT DISTINCT ?item WHERE {{ + {{ + ?item p:{instance_of}/ps:{instance_of}/wdt:{subclass_of}* + wd:{foss}. + }} Union {{ + ?item p:{instance_of}/ps:{instance_of}/wdt:{subclass_of}* + wd:{free_software}. + }} Union {{ + ?item p:{instance_of}/ps:{instance_of}/wdt:{subclass_of}* + wd:{open_source_software}. + }} Union {{ + ?item p:{instance_of}/ps:{instance_of}/wdt:{subclass_of}* + wd:{public_domain}. + ?item p:{instance_of}/ps:{instance_of}/wdt:{subclass_of}* + wd:{software}. + }} + FILTER NOT EXISTS {{ ?item p:{license} ?license }} + }} ORDER BY ?item + """.format(**format_args) + else: + query = None + return query + + def run(self, item): + self.fixup(item) + self.verify(item) + + def verify(self, item): + item.get() + self.debug(item, " verifying") + + def get_names(self, lang): + if self.license2item is None: + self.set_license2item() + self.licenses = { + 'en': { + 'names': dict([ + (l, l) for (l, i) in self.license2item.items() + ]), + }, + } + self.set_redirects('en') + if lang not in self.licenses: + self.set_names(lang) + self.set_redirects(lang) + licenses = self.licenses[lang] + return (list(licenses['names'].keys()) + + list(licenses['redirects'].keys())) + + def set_redirects(self, lang): + redirects = {} + for name in self.licenses[lang]['names'].keys(): + redirects[name] = name + for redirect in self.get_redirects(name, lang): + redirects[redirect] = name + self.licenses[lang]['redirects'] = redirects + + def set_names(self, lang): + lang2en = {} + self.licenses[lang] = {'names': lang2en} + for english in self.licenses['en']['names'].keys(): + title = self.translate_title(english, lang) + if title is not None: + log.debug("set_names " + lang + " " + english + " => " + title) + lang2en[title] = english + + def get_item(self, license, lang): + licenses = self.licenses[lang] + canonical = licenses['redirects'][license] + english = licenses['names'][canonical] + return self.license2item[english] + + def set_dbname2item(self): + query = """ + SELECT DISTINCT ?item WHERE {{ + ?item wdt:{dbname} ?dbname. + }} + """.format(dbname=self.P_Wikimedia_database_name) + log.debug("set_dbname2item " + query) + self.license2item = {} + enwiki = pywikibot.site.APISite.fromDBName('enwiki') + for item in pg.WikidataSPARQLPageGenerator(query, + site=self.bot.site, + result_type=list): + item.get() + log.debug("set_dbname2item " + item.title() + + " " + str(item.labels.get('en'))) + if 'enwiki' not in item.sitelinks: + log.debug('ignore ' + item.title() + + " because it does not link to enwiki") + continue + p = pywikibot.Page(enwiki, item.sitelinks['enwiki']) + self.license2item[p.title()] = item + + def set_license2item(self): + format_args = { + 'subclass_of': self.P_subclass_of, + 'instance_of': self.P_instance_of, + 'open_source': self.Q_open_source_license.getID(), + 'free_software': self.Q_free_software_license.getID(), + 'licenses': '', + } + if self.args.license: + licenses = [] + for license in self.args.license: + licenses.append("STR(?label) = '" + license + "'") + licenses = ('?item rdfs:label ?label FILTER(' + + " || ".join(licenses) + ")") + format_args['licenses'] = licenses + query = """ + SELECT DISTINCT ?item WHERE {{ + {{ + ?item wdt:{instance_of}?/wdt:{subclass_of}* wd:{open_source}. + }} Union {{ + ?item wdt:{instance_of}?/wdt:{subclass_of}* wd:{free_software}. + }} + {licenses} + }} + """.format(**format_args) + log.debug("set_license2item " + query) + self.license2item = {} + enwiki = pywikibot.site.APISite.fromDBName('enwiki') + for item in pg.WikidataSPARQLPageGenerator(query, + site=self.bot.site, + result_type=list): + item.get() + log.debug("set_license2item " + item.title() + + " " + str(item.labels.get('en'))) + if 'enwiki' not in item.sitelinks: + log.debug('set_license2item ignore ' + item.title() + + " because it does not link to enwiki") + continue + p = pywikibot.Page(enwiki, item.sitelinks['enwiki']) + self.license2item[p.title()] = item + + def template_parse_license(self, license, lang): + free_software_licenses = self.get_names(lang) + results = set() + for name in (re.findall('\[\[([^|\]]+?)\]\]', license) + + re.findall('\[\[([^|\]]+?)\|[^\]]*\]\]', license)): + log.debug("template_parse_license: " + name) + if name in free_software_licenses: + results.add(self.get_item(name, lang)) + return list(results) + + def fixup(self, item): + item.get() + + if self.P_license in item.claims: + return ['exists'] + + lang2field = { + 'ca': 'Llicència', + 'en': 'License', + 'ja': 'license', + 'ml': 'license', + 'ru': 'license', + 'zh': 'license', + } + lang2template = { + 'ca': 'Caixa Programari', + 'es': 'Ficha de software', + 'en': 'Infobox', + 'it': 'Software', + 'pt': 'Info/Software', + 'ru': 'Карточка программы', + '*': 'Infobox', + } + lang2value = {} + for (lang, license) in self.get_template_field( + item, lang2field, lang2template).items(): + lang2value[lang] = self.template_parse_license(license, lang) + if len(lang2value) == 0: + return ['nothing'] + self.debug(item, "fixup " + str(lang2value)) + values = list(lang2value.values()) + # if one wikipedia disagrees with the others, do nothing + if values.count(values[0]) != len(values): + self.error(item, + "inconsistent license information between wikipedia" + + str(lang2value)) + return ['inconsistent'] + status = [] + for license in lang2value[list(lang2value.keys())[0]]: + license.get() + langs = list(lang2value.keys()) + self.info(item, "ADD license " + license.labels['en'] + + " from " + str(langs)) + status.append(license.labels['en']) + claim = pywikibot.Claim(self.bot.site, self.P_license, 0) + claim.setTarget(license) + if not self.args.dry_run: + item.addClaim(claim) + for lang in langs: + imported = pywikibot.Claim(self.bot.site, + self.P_imported_from, + isReference=True) + imported.setTarget(self.get_sitelink_item(lang + "wiki")) + if not self.args.dry_run: + claim.addSource(imported) + self.set_retrieved(item, claim) + return status diff --git a/tests/test_license.py b/tests/test_license.py new file mode 100644 index 0000000..62de15a --- /dev/null +++ b/tests/test_license.py @@ -0,0 +1,136 @@ +# -*- mode: python; coding: utf-8 -*- +# +# Copyright (C) 2016 Loic Dachary <l...@dachary.org> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +import logging + +import mock +import pywikibot + +from FLOSSbot.bot import Bot +from FLOSSbot.license import License +from tests.wikidata import WikidataHelper + +log = logging.getLogger('FLOSSbot') + + +class TestLicense(object): + + def setup_class(self): + WikidataHelper().login() + + def setup(self): + self.gpl = 'GNU General Public License' + self.mit = 'MIT license' + self.args = [ + '--license', self.gpl, + '--license', self.mit, + ] + + def test_get_item(self): + bot = Bot.factory(['--verbose'] + self.args) + license = License(bot, bot.args) + redirect = 'GPL' + license.get_names('en') + canonical_item = license.get_item(self.gpl, 'en') + assert canonical_item == license.get_item(redirect, 'en') + gpl_fr = 'Licence publique générale GNU' + names_fr = license.get_names('fr') + assert gpl_fr in names_fr + assert canonical_item == license.get_item(gpl_fr, 'fr') + + def test_get_names(self): + bot = Bot.factory(['--verbose'] + self.args) + license = License(bot, bot.args) + redirect = 'GPL' + names = license.get_names('en') + assert self.gpl in names + assert redirect in names + + canonical_fr = 'Licence publique générale GNU' + names = license.get_names('fr') + assert canonical_fr in names + assert self.gpl in names + + def test_template_parse_license(self): + bot = Bot.factory(['--verbose'] + self.args) + license = License(bot, bot.args) + found = license.template_parse_license('[[GPL]] [[MIT|]]', 'en') + for item in found: + item.get() + license.debug(item, "FOUND") + assert item.labels['en'] in (self.gpl, self.mit) + + @mock.patch('FLOSSbot.license.License.set_license2item') + @mock.patch('FLOSSbot.plugin.Plugin.get_sitelink_item') + def test_fixup(self, m_get_sitelink_item, m_set_license2item): + bot = Bot.factory([ + '--verbose', + '--test', + '--user=FLOSSbotCI', + ]) + l = License(bot, bot.args) + + gpl = l.Q_GNU_General_Public_License + gpl.get() + found = False + if gpl.claims: + for claim in gpl.claims.get(l.P_subclass_of, []): + if claim.type != 'wikibase-item': + continue + if (claim.getTarget().getID() == + l.Q_free_software_license.getID()): + found = True + break + if not found: + subclass_of = pywikibot.Claim(l.bot.site, l.P_subclass_of, 0) + subclass_of.setTarget(l.Q_free_software_license) + gpl.addClaim(subclass_of) + gpl.setSitelink({'site': 'enwiki', 'title': self.gpl}) + gpl.get(force=True) + + emacs = l.Q_GNU_Emacs + emacs.get() + if emacs.claims: + licenses = emacs.claims.get(l.P_license, []) + if licenses: + emacs.removeClaims(licenses) + emacs.get(force=True) + + def set_license2item(): + l.license2item = {self.gpl: l.Q_GNU_General_Public_License} + m_set_license2item.side_effect = set_license2item + + def get_sitelink_item(dbname): + if dbname == 'enwiki': + return l.Q_English_Wikipedia + elif dbname == 'frwiki': + return l.Q_French_Wikipedia + else: + assert 0, "unexpected " + dbname + m_get_sitelink_item.side_effect = get_sitelink_item + emacs.removeSitelinks(['enwiki']) + emacs.removeSitelinks(['frwiki']) + emacs.get(force=True) + assert ['nothing'] == l.fixup(emacs) + emacs.setSitelink({'site': 'enwiki', 'title': 'GNU Emacs'}) + emacs.setSitelink({'site': 'frwiki', 'title': 'GNU Emacs'}) + emacs.get(force=True) + assert [self.gpl] == l.fixup(emacs) + +# Local Variables: +# compile-command: "cd .. ; tox -e py3 tests/test_license.py" +# End: -- To view, visit https://gerrit.wikimedia.org/r/316059 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia9f52d1a1cf60de4a2d098cfecf6e056b3ca24ba Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/bots/FLOSSbot Gerrit-Branch: master Gerrit-Owner: Dachary <l...@dachary.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits