Lokal Profil has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/342198 )

Change subject: [WIP]Build fill_table_monuments_all from per dataset dicts.
......................................................................

[WIP]Build fill_table_monuments_all from per dataset dicts.

Intended as a proof-of-concept that the huge sql file can be
replaced by simpler per country dicts.

The outputted sql file could still be submitted to the existing
test suite but building it up in this way increases the chance that
issues are caught even eariler.

* fill_monuments_all_base.py is the workhorse which encodes the sql
  from simpler settings. It is constructed in an extendable way so
  that e.g. wlpa could fairly simply reuse most of it.
* make_fill_monuments_all.py is a simple example of what the per
  country dicts could look like.

Change-Id: I00824bb2cb91b1c83ba7fabfc437bdcc45a23558
---
A erfgoedbot/fill_monuments_all_base.py
A erfgoedbot/make_fill_monuments_all.py
2 files changed, 324 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage 
refs/changes/98/342198/1

diff --git a/erfgoedbot/fill_monuments_all_base.py 
b/erfgoedbot/fill_monuments_all_base.py
new file mode 100644
index 0000000..c4bb080
--- /dev/null
+++ b/erfgoedbot/fill_monuments_all_base.py
@@ -0,0 +1,283 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""Create the fill_table_monuments_all SQL."""
+import os
+from collections import OrderedDict
+import pywikibot
+
+
+class MonumentsAllSql(object):
+
+    """The full fill_monuments_all generation."""
+
+    def __init__(self, datasets):
+        """
+        Initialize the fill_monuments_all SQL object.
+
+        @param datasets: list of MonumentDatasetSql objects.
+        """
+        self.domain = 'monuments'
+        self.datasets = sorted(datasets)
+
+    def make_intro_sql(self):
+        """Construct the opening SQL describing the main table."""
+        sql = \
+"""/*
+ * Create table for all country tables.
+ *
+ * Do not update this file manually.
+ */
+
+-- Update PHP code when changing this
+SET @granularity = 20;
+
+SET NAMES UTF8;
+
+DROP TABLE IF EXISTS `{domain}_all_tmp`;
+
+CREATE TABLE `{domain}_all_tmp` (
+  `country` varchar(10) binary NOT NULL DEFAULT '',
+  `lang` varchar(10) binary NOT NULL DEFAULT '',
+  `project` varchar(150) NOT NULL DEFAULT 'wikipedia',
+  `id` varchar(25) NOT NULL DEFAULT '0',
+  `adm0` varchar(3) binary NOT NULL DEFAULT '',
+  `adm1` varchar(7) binary DEFAULT NULL,
+  `adm2` varchar(100) DEFAULT NULL,
+  `adm3` varchar(150) DEFAULT NULL,
+  `adm4` varchar(200) DEFAULT NULL,
+  `name` varchar(255) NOT NULL DEFAULT '',
+  `address` varchar(255) NOT NULL DEFAULT '',
+  `municipality` varchar(255) NOT NULL DEFAULT '',
+  `lat` double DEFAULT NULL,
+  `lon` double DEFAULT NULL,
+  `lat_int` smallint(6) DEFAULT NULL,
+  `lon_int` smallint(6) DEFAULT NULL,
+  `image` varchar(255) NOT NULL DEFAULT '',
+  `wd_item` varchar(255) DEFAULT NULL,
+  `commonscat` varchar(255) NOT NULL DEFAULT '',
+  `source` varchar(510) NOT NULL DEFAULT '',
+  `changed` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE 
CURRENT_TIMESTAMP,
+  `monument_article` varchar(255) NOT NULL DEFAULT '',
+  `registrant_url` varchar(255) NOT NULL DEFAULT '',
+  PRIMARY KEY (`country`,`lang`,`id`),
+  KEY `admin_levels0` (`adm0`, `lang`, `name`, `country`, `id`),
+  KEY `admin_levels1` (`adm0`, `adm1`, `lang`, `name`, `country`, `id`),
+  KEY `admin_levels2` (`adm2`(32), `lang`, `name`, `country`, `id`),
+  KEY `admin_levels3` (`adm3`(32), `lang`, `name`, `country`, `id`),
+  KEY `admin_levels4` (`adm4`(32), `lang`, `name`, `country`, `id`),
+  KEY `name` (`name`),
+  KEY `coord` (`lat_int`,`lon_int`,`lat`),
+  FULLTEXT KEY `name_address_ft` (`name`, `address`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+TRUNCATE TABLE `{domain}_all_tmp`;
+"""
+        return sql.format(domain=self.domain)
+
+    def make_outro_sql(self):
+        """Construct the ending SQL replacing the real table by the new one."""
+        sql = "-- UPDATE `{domain}_all_tmp` SET " \
+              "lat_int = ROUND(lat * @granularity), " \
+              "lon_int = ROUND(lon * @granularity);\n" \
+              "\n" \
+              "/* when both lat and lon = 0 something went wrong */\n" \
+              "UPDATE `{domain}_all_tmp`\n" \
+              "SET `lat`=NULL, `lon`=NULL, `lat_int`=NULL, `lon_int`=NULL\n" \
+              "WHERE `lat`=0 AND `lon`=0;\n" \
+              "\n" \
+              "DROP TABLE IF EXISTS `{domain}_all`;\n" \
+              "\n" \
+              "ALTER TABLE `{domain}_all_tmp` RENAME TO `{domain}_all`;"
+        return sql.format(domain=self.domain)
+
+    def get_sql(self):
+        """Construct the full SQL needed to insert all datasets."""
+        return "{intro}\n{data}{outro}".format(
+            intro=self.make_intro_sql(),
+            data='\n'.join([dataset.get_sql() for dataset in self.datasets]),
+            outro=self.make_outro_sql())
+
+    def write_sql(self):
+        """Output the SQL to the correct file."""
+        filename = "fill_table_{domain}_all.sql".format(domain=self.domain)
+        sql_dir = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "sql")
+        with open(os.path.join(sql_dir, filename), "w") as f:
+            f.write(self.get_sql())
+
+
+class MonumentDatasetSql(object):
+
+    """A single dataset (country) in the fill_monuments_all generation."""
+
+    def __init__(self, country, language, replacements):
+        """
+        Initialize the dataset SQL object.
+
+        @param country: Full text name of country or dataset. E.g. 'Andorra'
+            or 'Sweden (Listed historical ships)'.
+        @param language: Full text name of the language. E.g. 'Swedish'.
+        @param replacements: Dictionary with target variable as key and
+            replacement SQL a value. e.g. {"adm0": "'ad'", "lat": "`lat`"}
+        """
+        self.country = country
+        self.language = language
+        self.domain = 'monuments'
+        self.dataset = None  # the dataset/country code
+        self.lang = None  # the language code
+
+        self.variables = OrderedDict([
+            ('country', None),
+            ('lang', None),
+            ('project', None),
+            ('id', None),
+            ('adm0', None),
+            ('adm1', None),
+            ('adm2', None),
+            ('adm3', None),
+            ('adm4', None),
+            ('name', None),
+            ('address', None),
+            ('municipality', None),
+            ('lat', None),
+            ('lon', None),
+            ('lat_int', 'ROUND(`lat` * @granularity)'),
+            ('lon_int', 'ROUND(`lon` * @granularity)'),
+            ('image', None),
+            ('wd_item', None),
+            ('commonscat', None),
+            ('source', None),
+            ('changed', None),
+            ('monument_article', None),
+            ('registrant_url', None)
+            ])
+
+        self.load_values(replacements)
+
+    def __lt__(self, other):
+        """
+        Implement less than to allow for sorting.
+
+        Sorting is done on table name.
+
+        @param other: a second MonumentDatasetSql object.
+        """
+        return self.get_table_name() < other.get_table_name()
+
+    def get_table_name(self):
+        """Return the table name."""
+        return "{domain}_{country}_({lang})".format(
+            domain=self.domain,
+            country=self.dataset,
+            lang=self.lang)
+
+    def get_replaced(self):
+        """Return the list of replaced variables."""
+        replaced = []
+        for variable, value in self.variables.iteritems():
+            if value is not None:
+                replaced.append(variable)
+        return replaced
+
+    def get_sql(self):
+        """Construct the full SQL needed to insert a dataset."""
+        sql = "{intro}{variable}\n" \
+              "    FROM `{table}`;\n"
+        return sql.format(
+            intro=self.make_intro_sql(),
+            variable=self.make_varible_sql(),
+            table=self.get_table_name())
+
+    def make_intro_sql(self):
+        """Construct the opening SQL listing fields to be replaced."""
+        replaced = self.get_replaced()
+        sql = "/* {dataset} in {lang} */\n" \
+              "REPLACE INTO\n" \
+              "  `{domain}_all_tmp` (\n" \
+              "    {replaced}\n" \
+              ") SELECT\n"
+        return sql.format(
+            dataset=self.country,
+            lang=self.language,
+            domain=self.domain,
+            replaced=", ".join(["`{}`".format(x) for x in replaced]))
+
+    def make_varible_sql(self):
+        """Construct the SQL for mapping country tables variables."""
+        sql_lines = []
+        for variable, value in self.variables.iteritems():
+            if value is not None:
+                sql_lines.append("    {val} AS `{var}`".format(
+                    val=value, var=variable))
+        return ",\n".join(sql_lines)
+
+    def load_values(self, replacements):
+        """
+        Load the dataset specific replacements into self.variables.
+
+        Does not accept variables other than those in self.variables.
+        Automatically handles lat_int and lon_int.
+
+        @param replacements: Dictionary with target variable as key and
+            replacement SQL a value. e.g. {"adm0": "'ad'", "lat": "`lat`"}
+        """
+        if 'country' not in replacements or 'lang' not in replacements:
+            raise ValueError("Both 'country' and 'lang' must be replaced")
+        self.dataset = replacements['country'].text
+        self.lang = replacements['lang'].text
+
+        if 'lat' not in replacements:
+            self.variables['lat_int'] = None
+        if 'lon' not in replacements:
+            self.variables['lon_int'] = None
+
+        for variable in self.variables:
+            if variable in replacements:
+                value = replacements[variable]
+                if not isinstance(value, VariableType):
+                    raise ValueError(
+                        "All variables must be encoded through VariableType, "
+                        "'{}' was not".format(variable))
+                self.variables[variable] = value.format()
+
+        for target in replacements:
+            if target not in self.variables:
+                pywikibot.warning(
+                    "Unrecognized variable in {table}: {variable}".format(
+                        table=self.get_table_name(), variable=target))
+
+
+class VariableType(object):
+
+    """Abstract class to hold the variable type for a replacement variable."""
+
+    def __init__(self, text):
+        self.text = text
+
+    def format(self):
+        """Output the variable in a sql compatible mode."""
+        raise NotImplementedError
+
+
+class Text(VariableType):
+
+    """A plain string."""
+
+    def format(self):
+        return "'{}'".format(self.text)
+
+
+class Field(VariableType):
+
+    """A field reference."""
+
+    def format(self):
+        return "`{}`".format(self.text)
+
+
+class Raw(VariableType):
+
+    """Raw sql."""
+
+    def format(self):
+        return self.text
diff --git a/erfgoedbot/make_fill_monuments_all.py 
b/erfgoedbot/make_fill_monuments_all.py
new file mode 100644
index 0000000..4a9a890
--- /dev/null
+++ b/erfgoedbot/make_fill_monuments_all.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""Create the fill_table_monuments_all SQL."""
+from erfgoedbot.fill_monuments_all_base import (
+    MonumentsAllSql,
+    MonumentDatasetSql,
+    Text,
+    Field,
+    Raw
+)
+
+
+def make_ad_ca():
+    data = {
+        'country': Text('ad'),
+        'lang': Text('ca'),
+        'id': Field('id'),
+        'adm0': Text('ad'),
+        'adm1': Raw('LOWER(`region`)'),
+        'name': Field('nomcoor'),
+        'address': Field('lloc'),
+        'municipality': Field('municipi'),
+        'lat': Field('lat'),
+        'lon': Field('lon'),
+        'image': Field('imatge'),
+        'commonscat': Field('commonscat'),
+        'source': Field('source'),
+        'changed': Field('changed'),
+        'monument_article': Field('monument_article'),
+        'registrant_url': Field('registrant_url'),
+        'wd_item': Field('wd_item')
+    }
+    return MonumentDatasetSql('Andorra', 'Catalan', data)
+
+
+def print_all():
+    ad_ca = make_ad_ca()
+    print MonumentsAllSql([ad_ca, ]).get_sql()
+
+if __name__ == "__main__":
+    print_all()

-- 
To view, visit https://gerrit.wikimedia.org/r/342198
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I00824bb2cb91b1c83ba7fabfc437bdcc45a23558
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Lokal Profil <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to