Rush has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/325949 )

Change subject: labsdb: cleanup maintain-meta_p enough to make it viable
......................................................................

labsdb: cleanup maintain-meta_p enough to make it viable

much of this mirrors maintain-views behavior

* converting to python2 for now to use requests
* use local socket
* use common path for mediawiki-config
* visual editor is now a default so handle nondefault
* allow specification of all or listed dbs
* add dry-run arg

Change-Id: Ie839af3d5687354dc50e8cb412909616eb107a1d
---
M modules/role/files/labs/db/views/maintain-meta_p.py
M modules/role/manifests/labs/db/views.pp
2 files changed, 269 insertions(+), 161 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/49/325949/1

diff --git a/modules/role/files/labs/db/views/maintain-meta_p.py 
b/modules/role/files/labs/db/views/maintain-meta_p.py
index 33fa96a..b5403eb 100644
--- a/modules/role/files/labs/db/views/maintain-meta_p.py
+++ b/modules/role/files/labs/db/views/maintain-meta_p.py
@@ -1,5 +1,5 @@
-#! /usr/bin/python3
-# -*- coding: utf-8 -*-
+#! /usr/bin/env python
+# -*- encoding: utf-8 -*-
 
 #  Based on work by Marc-André Pelletier, ported to Python by Alex Monk
 #  Copyright © 2015 Alex Monk <kren...@gmail.com>
@@ -17,166 +17,264 @@
 #  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 #  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #
-#
-# This script sets up tables of metainformation on each slice (in the meta_p
-#  database).
-#
-#  The script expects to be invoked in a fresh copy of
-#  operations/mediawiki-config where it will get most of its information,
-#  and will connect to each wiki through the API to get the rest.
-#
-#  It connects to the slices with the credentials in the invoking
-#  user's .my.cnf, but is probably only useful if those credentials
-#  have full control over the meta_p database on each slice to be processed.
-#
 
-slices = [
-    ('labsdb1001.eqiad.wmnet', 3306),
-    ('labsdb1002.eqiad.wmnet', 3306),
-    ('labsdb1003.eqiad.wmnet', 3306)
-]
-
-import codecs
-import collections
-from configparser import ConfigParser
-import json
+import argparse
 import logging
+import os
 import pymysql
+import requests
 import re
+import simplejson as json
 import subprocess
+import sys
 import urllib
-import urllib.request
-
-config = ConfigParser()
-config.read('.my.cnf')
-
-# Strip first and last characters - just apostrophes
-dbuser = config.get('client', 'user')[1:-1]
-dbpassword = config.get('client', 'password')[1:-1]
-
-subprocess.call(["git", "pull"], cwd="mediawiki-config")
-
-dbs = {db: {} for db in 
open('mediawiki-config/all.dblist').read().splitlines()}
+import yaml
 
 
-def read_list(listFname, prop, val):
-    for db in open('mediawiki-config/' + listFname + 
'.dblist').read().splitlines():
-        if db in dbs:
-            dbs[db][prop] = val
class SchemaOperations():
    """Wrap a database handle/cursor so writes can be simulated.

    When constructed with dry_run=True, every statement is logged but
    never sent to the server, which lets the script be run safely for
    review before making real changes.
    """

    def __init__(self, dry_run, db, cursor):
        # dry_run: bool  -- log statements instead of executing them
        # db:      open pymysql connection (kept for callers' use)
        # cursor:  cursor on that connection used for all writes
        self.dry_run = dry_run
        self.db = db
        self.cursor = cursor

    def write_execute(self, query):
        """ Do operation or simulate
        :param query: str
        """
        # Encode before logging so non-ASCII SQL can't blow up the
        # debug formatting under Python 2.
        logging.debug("SQL: {}".format(query.encode('utf-8')))
        if self.dry_run:
            return
        self.cursor.execute(query)
 
-# TODO: silver/labtestweb2001
-for slice in ['s1', 's2', 's3', 's4', 's5', 's6', 's7']:
-    read_list(slice, "slice", slice)
 
-for family in ["wikibooks", "wikidata", "wikinews", "wikiquote", "wikisource",
-               "wikiversity", "wikivoyage", "wiktionary", "wikimania", 
"wikimedia",
-               "wikipedia"]:
-    read_list(family, "family", family)
def force_to_unicode(text):
    """ Output unicode or else
    :param text: str or unicode
    :return: unicode
    """
    # Checking for bytes (== str on Python 2) instead of the Python-2-only
    # name `unicode` keeps this helper importable on Python 3 as well,
    # with identical behavior on Python 2: byte strings are decoded as
    # UTF-8, already-decoded text passes through untouched.
    if isinstance(text, bytes):
        return text.decode('utf8')
    return text
 
-# Sadly, case sensitivity of titles isn't in a .dblist, nor is it
-# exposed through the API so we have to hardcode it here to match
-# what is in InitialiseSettings.php
-read_list("wiktionary", "sensitive", True)
-dbs['jbowiki']['sensitive'] = True
 
-inCanonConfig = False
-canonical = {}
-for line in 
open('mediawiki-config/wmf-config/InitialiseSettings.php').read().splitlines():
-    if line == "'wgCanonicalServer' => array(":
-        inCanonConfig = True
-    elif inCanonConfig and line == "),":
-        inCanonConfig = False
-    else:
-        matches = re.match("^\s+'(.*)'\s+=>\s+'(.*)'\s*,\s*$", line)
-        if inCanonConfig and matches:
-            canonical[matches.group(1)] = matches.group(2)
def parse_php_canonical(mfile):
    """ Given the InitialiseSettings.php file pull out the
    list of canonical urls the hard way
    :param mfile: str, path to InitialiseSettings.php
    :return: dict mapping dbname/family to canonical server url
    :raises ValueError: if no wgCanonicalServer array is found
    """
    raw_canonical_server = None
    with open(mfile, 'r') as f:
        # Split on PHP array terminators and keep the chunk holding the
        # wgCanonicalServer array (the last one, if several match).
        for chunk in re.split('\n],', f.read()):
            if "wgCanonicalServer' => [" in chunk:
                raw_canonical_server = chunk

    # Previously a missing array fell through to an UnboundLocalError;
    # fail with an explicit, actionable message instead.
    if raw_canonical_server is None:
        raise ValueError(
            "no wgCanonicalServer array found in {}".format(mfile))

    canonical_servers = {}
    for line in raw_canonical_server.split('\n'):
        if '=>' not in line:
            continue
        key, value = line.split('=>')
        key = key.strip().strip("'")
        # Drop the trailing comma (and anything after it) from the value.
        if ',' in value:
            value = value.split(',')[0]
        canonical_servers[key] = value.strip().strip("'")
    return canonical_servers
 
-    canon = None
-    if db in canonical:
-        canon = canonical[db]
-    else:
-        matches = re.match("^(.*)(wik[it].*)", db)
-        if matches:
-            lang = matches.group(1)
-            canon = canonical[dbInfo['family']].replace('$lang', lang)
 
-    if canon:
-        canon = canon.replace('_', '-')
-        dbInfo['url'] = canon
-        if canon in cached:
-            dbInfo['lang'] = cached[canon]['lang']
-            dbInfo['name'] = cached[canon]['name']
-        else:
-            logging.info("Querying " + canon + "...")
-            try:
-                url_tail = 
"/w/api.php?action=query&meta=siteinfo&siprop=general&format=json"
-                req = urllib.request.Request(canon + url_tail)
-                req.add_header("User-Agent", "operations/software.git 
maintain-meta_p.py")
def seed_schema(ops):
    """Create the meta_p database objects if they do not yet exist,
    then open a transaction and empty meta_p.wiki for reseeding.

    The caller is expected to INSERT fresh rows and COMMIT afterwards.
    :param ops: SchemaOperations
    """
    statements = (
        "CREATE DATABASE IF NOT EXISTS meta_p DEFAULT CHARACTER SET utf8;",
        """CREATE TABLE IF NOT EXISTS meta_p.wiki (
            dbname varchar(32) PRIMARY KEY,
            lang varchar(12) NOT NULL DEFAULT 'en',
            name text,
            family text,
            url text,
            size numeric(1) NOT NULL DEFAULT 1,
            slice text NOT NULL,
            is_closed numeric(1) NOT NULL DEFAULT 0,
            has_echo numeric(1) NOT NULL DEFAULT 1,
            has_flaggedrevs numeric(1) NOT NULL DEFAULT 0,
            has_visualeditor numeric(1) NOT NULL DEFAULT 0,
            has_wikidata numeric(1) NOT NULL DEFAULT 0,
            is_sensitive numeric(1) NOT NULL DEFAULT 0);
            """,
        """CREATE OR REPLACE VIEW meta_p.legacy AS
        SELECT dbname, lang, family, NULL AS domain, size, 0 AS is_meta,
               is_closed, 0 AS is_multilang, (family='wiktionary') AS is_sensitive,
               NULL AS root_category, slice AS server, '/w/' AS script_path
            FROM meta_p.wiki;""",
        """CREATE TABLE IF NOT EXISTS meta_p.properties_anon_whitelist (
        pw_property varbinary(255) PRIMARY KEY);""",
        "START TRANSACTION;",
        "TRUNCATE meta_p.wiki;",
    )
    for statement in statements:
        ops.write_execute(statement)
+
+
+def main():
+
+    argparser = argparse.ArgumentParser(
+        "maintain-meta_p",
+        description="Maintain metadatabase of wiki's"
+    )
+
+    group = argparser.add_mutually_exclusive_group(required=True)
+    group.add_argument(
+        '--databases',
+        help=("Specify database(s) to work on, instead of all. Multiple"
+              " values can be given space-separated."),
+        nargs="+"
+    )
+    group.add_argument(
+        '--all-databases',
+        help='Flag to run through all possible databases',
+        action='store_true',
+    )
+
+    argparser.add_argument(
+        "--dry-run",
+        help=("Give this parameter if you don't want the script to actually"
+              " make changes."),
+        action="store_true"
+    )
+
+    # piggyback on maintain-views for now
+    argparser.add_argument(
+        "--config-location",
+        help="Path to find the configuration file",
+        default="/etc/maintain-views.yaml"
+    )
+
+    argparser.add_argument(
+        "--mediawiki-config",
+        help=("Specify path to mediawiki-config checkout"
+              " values can be given space-separated."),
+        default="/usr/local/lib/mediawiki-config"
+    )
+
+    argparser.add_argument(
+        '--debug',
+        help='Turn on debug logging',
+        action='store_true'
+    )
+
+    args = argparser.parse_args()
+
+    logging.basicConfig(
+        format='%(asctime)s %(levelname)s %(message)s',
+        level=logging.DEBUG if args.debug else logging.INFO,
+    )
+
+    logging.debug(args)
+
+    with open(args.config_location, 'r') as stream:
+        try:
+            config = yaml.load(stream)
+        except yaml.YAMLError as exc:
+            logging.critical(exc)
+            sys.exit(1)
+
+    dbh = pymysql.connect(
+        user=config["mysql_user"],
+        passwd=config["mysql_password"],
+        unix_socket="/tmp/mysql.sock",
+        charset="utf8"
+    )
+
+    ops = SchemaOperations(args.dry_run,
+                           dbh,
+                           dbh.cursor())
+
+    seed_schema(ops)
+
+    alldbs = '{}/dblists/all.dblist'.format(args.mediawiki_config)
+    dbs = {db: {"has_visualeditor": True}
+           for db in open(alldbs).read().splitlines()}
+
+    if args.databases:
+        dbs = {k: v for k, v in dbs.iteritems() if k in args.databases}
+
+    def read_list(fname, prop, val):
+        fpath = os.path.join('{}/dblists/'.format(args.mediawiki_config),
+                             fname + '.dblist')
+        for db in open(fpath).read().splitlines():
+            if db in dbs:
+                dbs[db][prop] = val
+
+    read_list("closed", "closed", True)
+    read_list("deleted", "deleted", True)
+    read_list("small", "size", 1)
+    read_list("medium", "size", 2)
+    read_list("large", "size", 3)
+    read_list("private", "private", True)
+    read_list("special", "family", "special")
+    read_list("flaggedrevs", "has_flaggedrevs", True)
+    read_list("visualeditor-nondefault", "has_visualeditor", False)
+    read_list("wikidataclient", "has_wikidata", True)
+
+    # TODO: silver/labtestweb2001
+    for slice in ['s1', 's2', 's3', 's4', 's5', 's6', 's7']:
+        read_list(slice, "slice", slice)
+
+    for family in [
+        "wikibooks",
+        "wikidata",
+        "wikinews",
+        "wikiquote",
+        "wikisource",
+        "wikiversity",
+        "wikivoyage",
+        "wiktionary",
+        "wikimania",
+        "wikimedia",
+        "wikipedia",
+    ]:
+        read_list(family, "family", family)
+
+    # case sensitivity of titles isn't in a .dblist, nor is it
+    # exposed through the API so we have to hardcode it here to match
+    # what is in InitialiseSettings.php
+    read_list("wiktionary", "sensitive", True)
+    if 'jbowiki' in dbs:
+        dbs['jbowiki']['sensitive'] = True
+
+    initialise_settings = 
'{}/wmf-config/InitialiseSettings.php'.format(args.mediawiki_config)
+    canonical = parse_php_canonical(initialise_settings)
+
     for db, dbInfo in dbs.items():
+
+        logging.debug("collecting action api info for {}".format(db))
+        if 'private' in dbInfo and dbInfo['private']:
+            continue
+
+        elif 'deleted' in dbInfo and dbInfo['deleted']:
+            continue
+
+        if db in canonical:
+            url = canonical[db]
+        else:
+            lang = db[:2]
+            url = canonical[dbInfo['family']].replace('$lang', lang)
+
+        if url:
+            dbInfo['url'] = url
+            canon = url.replace('_', '-')
+            try:
+                url_tail = 
"/w/api.php?action=query&meta=siteinfo&siprop=general&format=json"
+                header = {"User-Agent": "Labsdb maintain-meta_p.py"}
+                r = requests.get(canon + url_tail, headers=header)
+                request = r.content
+                siteinfo = json.loads(request)
+                name = 
force_to_unicode(siteinfo['query']['general']['sitename'])
+                lange = force_to_unicode(siteinfo['query']['general']['lang'])
+                dbInfo['name'] = name
+                dbInfo['lang'] = lang
+                logging.debug('collected name ({}) and lang ({}) from 
api'.format(name, lang))
+            except Exception as e:
+                logging.warning('failed request for {}'.format(canon))
+
+    for db, dbInfo in dbs.items():
+
+        logging.debug("update meta_p for {}".format(db))
         if 'deleted' in dbInfo and dbInfo['deleted']:
             continue
         elif 'private' in dbInfo and dbInfo['private']:
@@ -206,6 +304,7 @@
             'family': None,
             'name': None
         }
+
         if 'url' in dbInfo:
             fields['url'] = dbInfo['url']
         if 'family' in dbInfo:
@@ -213,26 +312,30 @@
         if 'lang' in dbInfo:
             fields['lang'] = dbInfo['lang']
         if 'name' in dbInfo:
-            fields['name'] = dbInfo['name']
+            fields['name'] = force_to_unicode(dbInfo['name'])
         if 'size' in dbInfo:
             fields['size'] = dbInfo['size']
-        cursor.execute(
-            "INSERT INTO meta_p.wiki " +
-            "(has_flaggedrevs, has_visualeditor, " +
-            "has_wikidata, is_closed, is_sensitive, dbname, slice, " +
-            "url, family, lang, name, size) " +
-            "VALUES (%(has_flaggedrevs)s, %(has_visualeditor)s, " +
-            "%(has_wikidata)s, %(is_closed)s, " +
-            "%(is_sensitive)s, %(dbname)s, %(slice)s, %(url)s, %(family)s, 
%(lang)s, " +
-            "%(name)s, %(size)s);",
-            fields
-        )
 
-    cursor.execute("COMMIT;")
-    cursor.execute("START TRANSACTION;")
-    cursor.execute("DELETE FROM meta_p.properties_anon_whitelist;")
-    # This is hardcoded for now
-    cursor.execute("INSERT INTO meta_p.properties_anon_whitelist VALUES 
('gadget-%');")
-    cursor.execute("COMMIT;")
+        ops.write_execute("""INSERT INTO meta_p.wiki
+             (dbname, lang, name, family,
+              url, size, slice, is_closed,
+              has_flaggedrevs, has_visualeditor, has_wikidata,
+              is_sensitive)
+              VALUES
+              ('%(dbname)s', '%(lang)s', '%(name)s', '%(family)s',
+              '%(url)s', %(size)s, '%(slice)s', %(is_closed)s,
+              %(has_flaggedrevs)s, %(has_visualeditor)s, %(has_wikidata)s,
+              %(is_sensitive)s);""" % fields)
 
-logging.info("All done.")
+        ops.write_execute("COMMIT;")
+
+        ops.write_execute("COMMIT;")
+        ops.write_execute("START TRANSACTION;")
+        ops.write_execute("DELETE FROM meta_p.properties_anon_whitelist;")
+
+        # This is hardcoded for now
+        ops.write_execute("INSERT INTO meta_p.properties_anon_whitelist VALUES 
('gadget-%');")
+        ops.write_execute("COMMIT;")
+
+if __name__ == '__main__':
+    main()
diff --git a/modules/role/manifests/labs/db/views.pp 
b/modules/role/manifests/labs/db/views.pp
index 82a8e50..e2abccf 100644
--- a/modules/role/manifests/labs/db/views.pp
+++ b/modules/role/manifests/labs/db/views.pp
@@ -1,6 +1,10 @@
 # deploy scripts and its dependencies to create replica views
 class role::labs::db::views {
 
+    package { ['python-simplejson', 'python-pymysql':
+        ensure => present,
+    }
+
     include passwords::labsdb::maintainviews
     $view_user = $::passwords::labsdb::maintainviews::user
     $view_pass = $::passwords::labsdb::maintainviews::db_pass
@@ -24,11 +28,12 @@
     }
 
     file { '/usr/local/sbin/maintain-meta_p':
-        ensure => file,
-        source => 'puppet:///modules/role/labs/db/views/maintain-meta_p.py',
-        owner  => 'root',
-        group  => 'root',
-        mode   => '0655',
+        ensure  => file,
+        source  => 'puppet:///modules/role/labs/db/views/maintain-meta_p.py',
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0655',
+        require => [Package['python-simplejson', 'python-pymysql']],
     }
 
     file { '/usr/local/src/heartbeat-views.sql':

-- 
To view, visit https://gerrit.wikimedia.org/r/325949
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie839af3d5687354dc50e8cb412909616eb107a1d
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Rush <r...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to