Lokal Profil has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/390169 )
Change subject: Make statistics table for unknown fields ...................................................................... Make statistics table for unknown fields Also add header to each report page. Change-Id: I89e5e2e4adcbfe31139fb111a436c6240d0a6c62 --- M erfgoedbot/update_database.py M tests/test_update_database.py 2 files changed, 580 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage refs/changes/69/390169/1 diff --git a/erfgoedbot/update_database.py b/erfgoedbot/update_database.py index fc0fceb..2086046 100755 --- a/erfgoedbot/update_database.py +++ b/erfgoedbot/update_database.py @@ -140,30 +140,59 @@ countryconfig.get('table'), field.get('conv'))) -def unknownFieldsStatistics(countryconfig, unknownFields): +def unknownFieldsStatistics(countryconfig, unknown_fields): """ Outputs a list of any unknown fields as a wikitext table. The table contains the name and frequency of the field and a sample of source pages where this field was encountered. + + @param countryconfig: the configurations for the dataset being processed. + @param unknown_fields: dict of discovered fields with each value being a + Counter for how frequently the field is encountered per page. + @return: dict summarising the usages """ site = pywikibot.Site(u'commons', u'commons') page = pywikibot.Page( site, u'Commons:Monuments database/Unknown fields/{0}'.format( countryconfig.get('table'))) - summary = u'Updating the list of unknown fields' + summary = u'Updating the list of unknown fields with {0} entries' - text = u'{| class="wikitable sortable"\n' - text += u'! Field !! Count !! Sources\n' - for key, counter in unknownFields.items(): - text += u'|-\n' - text += u'| {0} || {1} || {2}\n'.format( - key, sum(counter.values()), format_source_field(counter, site)) + text = ( + u'{{#ifexist:{{FULLPAGENAME}}/header' + u'|{{/header}}' + u'|For information on how to use this report and how to localise ' + u'these instructions visit ' + u'[[:c:Commons:Monuments_database/Unknown fields]]. }}\n') - text += u'|}\n' + total_usages = 0 + pages_with_fields = set() + + if not unknown_fields: + text += u'\nThere are no unknown fields left. Great work!\n' + else: + text += u'{| class="wikitable sortable"\n' + text += u'! Field !! Count !! Sources\n' + for key, counter in unknown_fields.iteritems(): + total_usages += sum(counter.values()) + pages_with_fields.update(counter.keys()) + text += u'|-\n' + text += u'| {0} || {1} || {2}\n'.format( + key, sum(counter.values()), format_source_field(counter, site)) + text += u'|}\n' + text += u'[[Category:Commons:Monuments database/Unknown fields]]' - common.save_to_wiki_or_local(page, summary, text) + common.save_to_wiki_or_local( + page, summary.format(len(unknown_fields)), text) + + return { + 'report_page': page, + 'config': countryconfig, + 'total_fields': len(unknown_fields), + 'total_pages': len(pages_with_fields), + 'total_usages': total_usages + } def format_source_field(sources, site, sample_size=4): @@ -445,7 +474,8 @@ if countryconfig.get('type') == 'sparql': process_country_wikidata(countryconfig, conn, cursor) else: - process_country_list(countryconfig, conn, cursor, fullUpdate, daysBack) + return process_country_list( + countryconfig, conn, cursor, fullUpdate, daysBack) def process_country_list(countryconfig, conn, cursor, fullUpdate, daysBack): @@ -484,7 +514,7 @@ page, page.permalink(percent_encoded=False), countryconfig, conn, cursor, unknownFields=unknownFields) - unknownFieldsStatistics(countryconfig, unknownFields) + return unknownFieldsStatistics(countryconfig, unknownFields) def load_wikidata_template_sparql(): @@ -522,6 +552,102 @@ for resultitem in query_result: process_monument_wikidata(resultitem, countryconfig, conn, cursor) + + +def make_statistics(statistics): + """Output the overall results for unknown fields as a nice wikitable.""" + site = pywikibot.Site('commons', 'commons') + page = pywikibot.Page( + site, u'Commons:Monuments database/Unknown fields/Statistics') + + text = ( + u'{| class="wikitable sortable"\n' + u'! country ' + u'!! lang ' + u'!! data-sort-type="number"|Total unknown fields ' + u'!! data-sort-type="number"|Total usage of unknown fields ' + u'!! data-sort-type="number"|Total pages containing unknown fields ' + u'!! Report page ' + u'!! Row template ' + u'!! Header template ' + u'\n') + + text_row = ( + u'|-\n' + u'| {code} \n' + u'| {lang} \n' + u'| {total_fields} \n' + u'| {total_usages} \n' + u'| {total_pages} \n' + u'| {report_page} \n' + u'| {row_template} \n' + u'| {header_template} \n') + + total_fields_sum = 0 + total_usages_sum = 0 + total_pages_sum = 0 + for row in statistics: + if not row: + # sparql harvests don't generate statistics + continue + countryconfig = row.get('config') + total_fields = row.get('total_fields') + total_usages = row.get('total_usages') + total_pages = row.get('total_pages') + + total_fields_sum += total_fields + total_usages_sum += total_usages + total_pages_sum += total_pages + + list_site = pywikibot.Site( + countryconfig.get('lang'), + countryconfig.get('project', u'wikipedia')) + row_template_page = pywikibot.Page( + list_site, + u'Template:{0}'.format(countryconfig.get('rowTemplate'))) + header_template_page = pywikibot.Page( + list_site, + u'Template:{0}'.format(countryconfig.get('headerTemplate'))) + + row_template = row_template_page.title( + asLink=True, withNamespace=False, insite=site) + header_template = header_template_page.title( + asLink=True, withNamespace=False, insite=site) + report_page = row.get('report_page').title( + asLink=True, withNamespace=False, insite=site) + + text += text_row.format( + code=countryconfig.get('country'), + lang=countryconfig.get('lang'), + total_fields=total_fields, + total_usages=total_usages, + total_pages=total_pages, + report_page=report_page, + row_template=row_template, + header_template=header_template) + + text += ( + u'|- class="sortbottom"\n' + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u"| '''{total_fields}'''\n" + u"| '''{total_usages}'''\n" + u"| '''{total_pages}'''\n" + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u'|}}\n'.format(total_fields=total_fields_sum, + total_usages=total_usages_sum, + total_pages=total_pages_sum)) + + comment = ( + u'Updating unknown fields statistics. Total of {total_fields} ' + u'unknown fields used {total_usages} times on {total_pages} different ' + u'pages.'.format(total_fields=total_fields_sum, + total_usages=total_usages_sum, + total_pages=total_pages_sum)) + pywikibot.debug(text, _logger) + common.save_to_wiki_or_local(page, comment, text) def main(): @@ -576,6 +702,7 @@ raise Exception(u'The "countrycode" and "langcode" arguments must ' u'be used together.') else: + statistics = [] for (countrycode, lang), countryconfig in mconfig.countries.iteritems(): if (countryconfig.get('skip') or (skip_wd and (countryconfig.get('type') == 'sparql'))): @@ -584,13 +711,15 @@ u'Working on countrycode "%s" in language "%s"' % ( countrycode, lang)) try: - processCountry(countryconfig, conn, cursor, fullUpdate, - daysBack) + statistics.append( + processCountry(countryconfig, conn, cursor, fullUpdate, + daysBack)) except Exception, e: pywikibot.error( u"Unknown error occurred when processing country " u"%s in lang %s\n%s" % (countrycode, lang, str(e))) continue + make_statistics(statistics) close_database_connection(conn, cursor) diff --git a/tests/test_update_database.py b/tests/test_update_database.py index bdb97a1..6b8c13b 100644 --- a/tests/test_update_database.py +++ b/tests/test_update_database.py @@ -1,13 +1,30 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- """Unit tests for update_database.""" import unittest -from collections import Counter +from collections import OrderedDict, Counter import mock import pywikibot from erfgoedbot import update_database + + +class TestCreateReportBase(unittest.TestCase): + + def setUp(self): + patcher = mock.patch( + 'erfgoedbot.update_database.common.save_to_wiki_or_local') + self.mock_save_to_wiki_or_local = patcher.start() + self.addCleanup(patcher.stop) + + # silence logger + patcher = mock.patch( + 'erfgoedbot.update_database.pywikibot.debug') + self.mock_debug = patcher.start() + self.addCleanup(patcher.stop) class TestUpdateDatabaseBase(unittest.TestCase): @@ -338,8 +355,12 @@ self.contents[u'source-field'] = val with self.assertRaises(pywikibot.Error) as cm: - update_database.updateMonument(self.contents, self.source, self.country_config, None, self.mock_cursor, self.mock_page) - self.assertEqual(cm.exception, 'Un-defined check in config for dummy_table: unknown') + update_database.updateMonument( + self.contents, self.source, self.country_config, None, + self.mock_cursor, self.mock_page) + self.assertEqual( + cm.exception, + 'Un-defined check in config for dummy_table: unknown') def test_trigger_problematic_check(self): # It is a known bug that any function can be triggered using a check @@ -354,8 +375,12 @@ self.contents[u'source-field'] = val with self.assertRaises(pywikibot.Error) as cm: - update_database.updateMonument(self.contents, self.source, self.country_config, None, self.mock_cursor, self.mock_page) - self.assertEqual(cm.exception, 'Un-defined check in config for dummy_table: connectDatabase') + update_database.updateMonument( + self.contents, self.source, self.country_config, None, + self.mock_cursor, self.mock_page) + self.assertEqual( + cm.exception, + 'Un-defined check in config for dummy_table: connectDatabase') class TestFormatSourceField(unittest.TestCase): @@ -400,3 +425,410 @@ sources, self.commons, sample_size=2), expected ) + + +class TestMakeStatistics(TestCreateReportBase): + + """Test the make_statistics method.""" + + def setUp(self): + super(TestMakeStatistics, self).setUp() + self.prefix = ( + u'{| class="wikitable sortable"\n' + u'! country ' + u'!! lang ' + u'!! data-sort-type="number"|Total unknown fields ' + u'!! data-sort-type="number"|Total usage of unknown fields ' + u'!! data-sort-type="number"|Total pages containing unknown fields ' + u'!! Report page ' + u'!! Row template ' + u'!! Header template ' + u'\n') + + self.postfix = ( + u'|- class="sortbottom"\n' + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u"| '''{total_fields}'''\n" + u"| '''{total_usages}'''\n" + u"| '''{total_pages}'''\n" + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u'|style="background-color: #ccc;"|\n' + u'|}}\n') + + self.comment = ( + u'Updating unknown fields statistics. Total of {total_fields} ' + u'unknown fields used {total_usages} times on {total_pages} ' + u'different pages.') + commons = pywikibot.Site('commons', 'commons') + self.page = pywikibot.Page( + commons, u'Commons:Monuments database/Unknown fields/Statistics') + + def test_make_statistics_single_basic(self): + test_wiki = pywikibot.Site('test', 'wikipedia') + report_page = pywikibot.Page(test_wiki, 'Foobar') + statistics = [{ + 'config': { + 'lang': 'en', + 'country': 'foo', + 'rowTemplate': 'row template', + 'headerTemplate': 'head template'}, + 'report_page': report_page, + 'total_fields': 123, + 'total_usages': 456, + 'total_pages': 789 + }] + + expected_rows = ( + u'|-\n' + u'| foo \n' + u'| en \n' + u'| 123 \n' + u'| 456 \n' + u'| 789 \n' + u'| [[wikipedia:test:Foobar|Foobar]] \n' + u'| [[wikipedia:en:Template:Row template|Row template]] \n' + u'| [[wikipedia:en:Template:Head template|Head template]] \n') + expected_total_fields = 123 + expected_total_usages = 456 + expected_total_pages = 789 + expected_text = self.prefix + expected_rows + self.postfix.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages) + + update_database.make_statistics(statistics) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.page, + self.comment.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages), + expected_text + ) + + def test_make_statistics_single_empty(self): + statistics = [None, ] + + expected_rows = '' + expected_total_fields = 0 + expected_total_usages = 0 + expected_total_pages = 0 + expected_text = self.prefix + expected_rows + self.postfix.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages) + + update_database.make_statistics(statistics) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.page, + self.comment.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages), + expected_text + ) + + def test_make_statistics_multiple_basic(self): + test_wiki = pywikibot.Site('test', 'wikipedia') + report_page_1 = pywikibot.Page(test_wiki, 'Foobar') + report_page_2 = pywikibot.Page(test_wiki, 'Barfoo') + statistics = [ + { + 'config': { + 'lang': 'en', + 'country': 'foo', + 'rowTemplate': 'row template', + 'headerTemplate': 'head template'}, + 'report_page': report_page_1, + 'total_fields': 123, + 'total_usages': 456, + 'total_pages': 789 + }, + { + 'config': { + 'lang': 'fr', + 'country': 'bar', + 'rowTemplate': 'row2 template', + 'headerTemplate': 'head2 template'}, + 'report_page': report_page_2, + 'total_fields': 321, + 'total_usages': 654, + 'total_pages': 987 + }] + + expected_rows = ( + u'|-\n' + u'| foo \n' + u'| en \n' + u'| 123 \n' + u'| 456 \n' + u'| 789 \n' + u'| [[wikipedia:test:Foobar|Foobar]] \n' + u'| [[wikipedia:en:Template:Row template|Row template]] \n' + u'| [[wikipedia:en:Template:Head template|Head template]] \n' + u'|-\n' + u'| bar \n' + u'| fr \n' + u'| 321 \n' + u'| 654 \n' + u'| 987 \n' + u'| [[wikipedia:test:Barfoo|Barfoo]] \n' + u'| [[wikipedia:fr:Modèle:Row2 template|Row2 template]] \n' + u'| [[wikipedia:fr:Modèle:Head2 template|Head2 template]] \n') + expected_total_fields = 444 + expected_total_usages = 1110 + expected_total_pages = 1776 + expected_text = self.prefix + expected_rows + self.postfix.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages) + + update_database.make_statistics(statistics) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.page, + self.comment.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages), + expected_text + ) + + def test_make_statistics_multiple_mixed(self): + test_wiki = pywikibot.Site('test', 'wikipedia') + report_page = pywikibot.Page(test_wiki, 'Foobar') + statistics = [ + None, + { + 'config': { + 'lang': 'en', + 'country': 'foo', + 'rowTemplate': 'row template', + 'headerTemplate': 'head template'}, + 'report_page': report_page, + 'total_fields': 123, + 'total_usages': 456, + 'total_pages': 789 + }, + None] + + expected_rows = ( + u'|-\n' + u'| foo \n' + u'| en \n' + u'| 123 \n' + u'| 456 \n' + u'| 789 \n' + u'| [[wikipedia:test:Foobar|Foobar]] \n' + u'| [[wikipedia:en:Template:Row template|Row template]] \n' + u'| [[wikipedia:en:Template:Head template|Head template]] \n') + expected_total_fields = 123 + expected_total_usages = 456 + expected_total_pages = 789 + expected_text = self.prefix + expected_rows + self.postfix.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages) + + update_database.make_statistics(statistics) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.page, + self.comment.format( + total_fields=expected_total_fields, + total_usages=expected_total_usages, + total_pages=expected_total_pages), + expected_text + ) + + +class TestUnknownFieldsStatistics(TestCreateReportBase): + + """Test the unknownFieldsStatistics method.""" + + def setUp(self): + super(TestUnknownFieldsStatistics, self).setUp() + self.mock_report_page = mock.create_autospec( + update_database.pywikibot.Page, + ) + patcher = mock.patch( + 'erfgoedbot.update_database.pywikibot.Page') + self.mock_pwb_page = patcher.start() + self.mock_pwb_page.return_value = self.mock_report_page + self.addCleanup(patcher.stop) + + patcher = mock.patch( + 'erfgoedbot.update_database.format_source_field') + self.mock_format_source_field = patcher.start() + self.mock_format_source_field.return_value = 'formatted_entry' + self.addCleanup(patcher.stop) + + self.prefix = ( + u'{{#ifexist:{{FULLPAGENAME}}/header' + u'|{{/header}}' + u'|For information on how to use this report and how to localise ' + u'these instructions visit ' + u'[[:c:Commons:Monuments_database/Unknown fields]]. }}\n') + + self.postfix = ( + u'[[Category:Commons:Monuments database/Unknown fields]]') + + self.comment = u'Updating the list of unknown fields with {0} entries' + self.countryconfig = { + 'table': 'table_name', + 'foo': 'bar' + } + self.commons = pywikibot.Site('commons', 'commons') + + self.unknown_fields = OrderedDict() + self.counter_1 = Counter({'page_11': 1, 'page_12': 5}) + self.unknown_fields['unknown_field_1'] = self.counter_1 + self.counter_2 = Counter({'page_21': 3}) + self.unknown_fields['unknown_field_2'] = self.counter_2 + + def test_unknown_fields_statistics_complete(self): + expected_cmt = self.comment.format(2) + expected_output = self.prefix + ( + u'{| class="wikitable sortable"\n' + u'! Field !! Count !! Sources\n' + u'|-\n' + u'| unknown_field_1 || 6 || formatted_entry\n' + u'|-\n' + u'| unknown_field_2 || 3 || formatted_entry\n' + u'|}\n') + self.postfix + expected_return = { + 'report_page': self.mock_report_page, + 'config': self.countryconfig, + 'total_fields': 2, + 'total_pages': 3, + 'total_usages': 9 + } + + result = update_database.unknownFieldsStatistics( + self.countryconfig, self.unknown_fields) + self.assertEqual(result, expected_return) + self.mock_pwb_page.assert_called_once_with( + self.commons, + u'Commons:Monuments database/Unknown fields/table_name' + ) + self.mock_format_source_field.assert_has_calls([ + mock.call(self.counter_1, self.commons), + mock.call(self.counter_2, self.commons)], + ) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.mock_report_page, + expected_cmt, + expected_output + ) + + def test_unknown_fields_statistics_no_unknown(self): + expected_cmt = self.comment.format(0) + expected_output = ( + self.prefix + + u'\nThere are no unknown fields left. Great work!\n' + + self.postfix) + expected_return = { + 'report_page': self.mock_report_page, + 'config': self.countryconfig, + 'total_fields': 0, + 'total_pages': 0, + 'total_usages': 0 + } + + result = update_database.unknownFieldsStatistics( + self.countryconfig, {}) + self.assertEqual(result, expected_return) + self.mock_pwb_page.assert_called_once_with( + self.commons, + u'Commons:Monuments database/Unknown fields/table_name' + ) + self.mock_format_source_field.assert_not_called() + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.mock_report_page, + expected_cmt, + expected_output + ) + + def test_unknown_fields_statistics_combine_pages(self): + new_counter = Counter({'page_11': 3, 'page_21': 3, 'page_22': 3}) + self.unknown_fields['unknown_field_2'] = new_counter + expected_cmt = self.comment.format(2) + expected_output = self.prefix + ( + u'{| class="wikitable sortable"\n' + u'! Field !! Count !! Sources\n' + u'|-\n' + u'| unknown_field_1 || 6 || formatted_entry\n' + u'|-\n' + u'| unknown_field_2 || 9 || formatted_entry\n' + u'|}\n') + self.postfix + expected_return = { + 'report_page': self.mock_report_page, + 'config': self.countryconfig, + 'total_fields': 2, + 'total_pages': 4, + 'total_usages': 15 + } + + result = update_database.unknownFieldsStatistics( + self.countryconfig, self.unknown_fields) + self.assertEqual(result, expected_return) + self.mock_pwb_page.assert_called_once_with( + self.commons, + u'Commons:Monuments database/Unknown fields/table_name' + ) + self.mock_format_source_field.assert_has_calls([ + mock.call(self.counter_1, self.commons), + mock.call(new_counter, self.commons)], + ) + self.mock_save_to_wiki_or_local.assert_called_once_with( + self.mock_report_page, + expected_cmt, + expected_output + ) + + +class TestprocessCountry(unittest.TestCase): + + """Test the processCountry method.""" + + def setUp(self): + patcher = mock.patch( + 'erfgoedbot.update_database.process_country_wikidata') + self.mock_process_country_wikidata = patcher.start() + self.addCleanup(patcher.stop) + + patcher = mock.patch( + 'erfgoedbot.update_database.process_country_list') + self.mock_process_country_list = patcher.start() + self.mock_process_country_list.return_value = 'unknown_field_stats' + self.addCleanup(patcher.stop) + + def test_process_country_sparql(self): + config = {'type': 'sparql'} + + result = update_database.processCountry( + config, 'conn', 'cursor', 'fullUpdate', 'daysBack') + self.assertEqual(result, None) + self.mock_process_country_wikidata.assert_called_once_with( + config, 'conn', 'cursor') + self.mock_process_country_list.assert_not_called() + + def test_process_country_list(self): + config = {'type': 'list'} + + result = update_database.processCountry( + config, 'conn', 'cursor', 'fullUpdate', 'daysBack') + self.assertEqual(result, 'unknown_field_stats') + self.mock_process_country_wikidata.assert_not_called() + self.mock_process_country_list.assert_called_once_with( + config, 'conn', 'cursor', 'fullUpdate', 'daysBack') + + def test_process_country_default_to_list(self): + config = {} + + result = update_database.processCountry( + config, 'conn', 'cursor', 'fullUpdate', 'daysBack') + self.assertEqual(result, 'unknown_field_stats') + self.mock_process_country_wikidata.assert_not_called() + self.mock_process_country_list.assert_called_once_with( + config, 'conn', 'cursor', 'fullUpdate', 'daysBack') -- To view, visit https://gerrit.wikimedia.org/r/390169 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I89e5e2e4adcbfe31139fb111a436c6240d0a6c62 Gerrit-PatchSet: 1 Gerrit-Project: labs/tools/heritage Gerrit-Branch: master Gerrit-Owner: Lokal Profil <lokal.pro...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits