Alex Monk has uploaded a new change for review.
https://gerrit.wikimedia.org/r/295607
Change subject: [WIP/POC/POS] Add python version of maintain-replicas script
......................................................................
[WIP/POC/POS] Add python version of maintain-replicas script
This does not include functionality that was split into maintain-meta_p.py last
year.
This (so far) deals with the following issues:
* It was written in Perl.
* Configuration was part of the script.
* It tried to parse ~/.my.cnf instead of getting them from it's own
configuration.
* Configuration file couldn't be specified on the CLI
It still needs to:
* Allow specifying one database to work on only.
* BE TESTED.
* Be commented more
* Do less work in file scope?
* Pause and ask for confirmation of things
* Have a "dry run" mode.
* Be more verbose.
* Be more helpful to the user in case of failure.
Change-Id: Icc51fc9a23f2fcf4c19c7adc092662d7122471e1
---
A maintain-replicas/config.json
A maintain-replicas/maintain-replicas.py
2 files changed, 312 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/software
refs/changes/07/295607/1
diff --git a/maintain-replicas/config.json b/maintain-replicas/config.json
new file mode 100644
index 0000000..8f9d3ff
--- /dev/null
+++ b/maintain-replicas/config.json
@@ -0,0 +1,158 @@
+{
+ "slices": [
+ ["labsdb1001.eqiad.wmnet", 3306],
+ ["labsdb1002.eqiad.wmnet", 3306],
+ ["labsdb1003.eqiad.wmnet", 3306]
+ ],
+ "mysql_user": "root",
+ "mysql_password": "password",
+ "fullviews": [
+ "abuse_filter_action", "abuse_filter_history",
+ "aft_article_answer", "aft_article_answer_text",
+ "aft_article_feedback_properties",
"aft_article_feedback_ratings_rollup",
+ "aft_article_feedback_select_rollup", "aft_article_field",
"aft_article_field_group",
+ "aft_article_field_option", "aft_article_filter_count",
"aft_article_revision_feedback_ratings_rollup",
+ "aft_article_revision_feedback_select_rollup", "article_assessment",
"article_assessment_pages",
+ "article_assessment_ratings", "article_feedback",
"article_feedback_pages",
+ "article_feedback_properties", "article_feedback_ratings",
"article_feedback_revisions",
+ "article_feedback_stats", "article_feedback_stats_types", "category",
"categorylinks", "change_tag",
+ "ep_articles", "ep_cas", "ep_courses", "ep_events", "ep_instructors",
"ep_oas", "ep_orgs",
+ "ep_revisions", "ep_students", "ep_users_per_course", "externallinks",
"flaggedimages",
+ "flaggedpage_config", "flaggedpage_pending", "flaggedpages",
"flaggedrevs", "flaggedrevs_promote",
+ "flaggedrevs_statistics", "flaggedrevs_stats", "flaggedrevs_stats2",
"flaggedrevs_tracking",
+ "flaggedtemplates", "geo_killlist", "geo_tags", "geo_updates",
"globalimagelinks", "global_block_whitelist",
+ "hashs", "hitcounter", "image", "imagelinks", "imagelinks_old",
"interwiki", "iwlinks",
+ "l10n_cache", "langlinks", "links", "localisation",
"localisation_file_hash",
+ "math", "module_deps", "msg_resource_links", "namespaces",
+ "page", "page_broken", "pagelinks", "page_props", "page_restrictions",
"pagetriage_log",
+ "pagetriage_page", "pagetriage_page_tags", "pagetriage_tags",
"pif_edits", "povwatch_log",
+ "povwatch_subscribers", "protected_titles", "pr_index", "redirect",
"renameuser_status", "site_identifiers",
+ "sites", "site_stats", "tag_summary", "templatelinks", "transcode",
"updatelog", "updates",
+ "user_former_groups", "user_groups", "valid_tag", "wikilove_image_log",
+ "wikilove_log", "global_group_permissions",
"global_group_restrictions", "global_user_groups",
+ "globalblocks", "localuser", "wikiset", "wb_changes",
"wb_changes_dispatch", "wb_changes_subscription",
+ "wb_entity_per_page", "wb_id_counters", "wb_items_per_site",
"wb_property_info", "wb_terms",
+ "wbc_entity_usage", "wbs_propertypairs"
+ ],
+ "logging_whitelist": [
+ "gblrights", "globalauth", "close", "lock", "gblblock", "managetags",
"gather", "campus",
+ "institution", "instructor", "online", "merge", "massmessage",
"course", "stable", "import",
+ "abusefilter", "student", "eparticle", "rights",
"pagetriage-deletion", "protect",
+ "pagetriage-curation", "thanks", "renameuser", "review", "block",
"upload", "move",
+ "delete", "patrol", "newusers"
+ ],
+ "customviews": {
+ "abuse_filter": {
+ "source": "abuse_filter",
+ "view": "select af_id, if(af_hidden,null,af_pattern) as
af_pattern, af_user, af_user_text, af_timestamp, af_enabled,
if(af_hidden,null,af_comments) as af_comments, af_public_comments, af_hidden,
af_hit_count, af_throttled, af_deleted, af_actions, af_global, af_group"
+ },
+ "abuse_filter_log": {
+ "source": "abuse_filter_log",
+ "view": "select afl_id, afl_filter, afl_user, afl_user_text, NULL
as afl_ip, afl_action, afl_actions, afl_var_dump, afl_timestamp, afl_namespace,
afl_title, afl_wiki, afl_deleted, afl_patrolled_by, afl_rev_id, afl_log_id"
+ },
+ "aft_article_feedback": {
+ "source": "aft_article_feedback",
+ "view": "select af_id, af_page_id, af_user_id, NULL as af_user_ip,
af_user_anon_token, af_revision_id, af_cta_id, af_link_id, af_created,
af_abuse_count, af_helpful_count, af_unhelpful_count, af_oversight_count,
af_is_deleted, af_is_hidden, af_net_helpfulness, af_has_comment,
af_is_unhidden, af_is_undeleted, af_is_declined, af_activity_count, af_form_id,
af_experiment, af_suppress_count, af_last_status, af_last_status_user_id,
af_last_status_timestamp, af_is_autohide, af_is_unrequested, af_is_featured,
af_is_unfeatured, af_is_resolved, af_is_unresolved, af_relevance_score,
af_relevance_sort, af_last_status_notes"
+ },
+ "archive": {
+ "source": "archive",
+ "view": "select ar_id, ar_namespace, ar_title, NULL as ar_text,
NULL as ar_comment, if(ar_deleted&4,null,ar_user) as ar_user,
if(ar_deleted&4,null,ar_user_text) as ar_user_text, ar_timestamp,
ar_minor_edit, ar_flags, ar_rev_id, if(ar_deleted&1,null,ar_text_id) as
ar_text_id, ar_deleted, if(ar_deleted&1,null,ar_len) as ar_len, ar_page_id,
ar_parent_id, if(ar_deleted&1,null,ar_sha1) as ar_sha1"
+ },
+ "archive_userindex": {
+ "source": "archive",
+ "view": "select ar_id, ar_namespace, ar_title, NULL as ar_text,
NULL as ar_comment, ar_user, ar_user_text, ar_timestamp, ar_minor_edit,
ar_flags, ar_rev_id, if(ar_deleted&1,null,ar_text_id) as ar_text_id,
ar_deleted, if(ar_deleted&1,null,ar_len) as ar_len, ar_page_id, ar_parent_id,
if(ar_deleted&1,null,ar_sha1) as ar_sha1",
+ "where": "(ar_deleted&4)=0"
+ },
+ "filearchive": {
+ "source": "filearchive",
+ "view": "select fa_id, fa_name, fa_archive_name, fa_storage_group,
NULL as fa_storage_key, fa_deleted_user, fa_deleted_timestamp,
fa_deleted_reason, if(fa_deleted&1,null,fa_size) as fa_size,
if(fa_deleted&1,null,fa_width) as fa_width, if(fa_deleted&1,null,fa_height) as
fa_height, if(fa_deleted&1,null,fa_metadata) as fa_metadata,
if(fa_deleted&1,null,fa_bits) as fa_bits, if(fa_deleted&1,null,fa_media_type)
as fa_media_type, if(fa_deleted&1,null,fa_major_mime) as fa_major_mime,
if(fa_deleted&1,null,fa_minor_mime) as fa_minor_mime,
if(fa_deleted&2,null,fa_description) as fa_description,
if(fa_deleted&4,null,fa_user)as fa_user, if(fa_deleted&4,null,fa_user_text) as
fa_user_text, fa_timestamp, fa_deleted, if(fa_deleted&1,null,fa_sha1) as
fa_sha1"
+ },
+ "filearchive_userindex": {
+ "source": "filearchive",
+ "view": "select fa_id, fa_name, fa_archive_name, fa_storage_group,
NULL as fa_storage_key, fa_deleted_user, fa_deleted_timestamp,
fa_deleted_reason, if(fa_deleted&1,null,fa_size) as fa_size,
if(fa_deleted&1,null,fa_width) as fa_width, if(fa_deleted&1,null,fa_height) as
fa_height, if(fa_deleted&1,null,fa_metadata) as fa_metadata,
if(fa_deleted&1,null,fa_bits) as fa_bits, if(fa_deleted&1,null,fa_media_type)
as fa_media_type, if(fa_deleted&1,null,fa_major_mime) as fa_major_mime,
if(fa_deleted&1,null,fa_minor_mime) as fa_minor_mime,
if(fa_deleted&2,null,fa_description) as fa_description, fa_user, fa_user_text,
fa_timestamp, fa_deleted, if(fa_deleted&1,null,fa_sha1) as fa_sha1",
+ "where": "(fa_deleted&4)=0"
+ },
+ "globaluser": {
+ "source": "globaluser",
+ "view": "select gu_id, gu_name, gu_enabled, gu_enabled_method,
gu_home_db, NULL as gu_email, NULL as gu_email_authenticated, NULL as gu_salt,
NULL as gu_password, gu_locked, gu_hidden, gu_registration, NULL as
gu_password_reset_key, NULL as gu_password_reset_expiration, NULL as
gu_auth_token",
+ "where": "gu_hidden=''"
+ },
+ "ipblocks": {
+ "source": "ipblocks",
+ "view": "select ipb_id, if(ipb_auto<>0,null,ipb_address) as
ipb_address, ipb_user, ipb_by, ipb_reason, ipb_timestamp, ipb_auto,
ipb_anon_only, ipb_create_account, ipb_expiry,
if(ipb_auto<>0,null,ipb_range_start) as ipb_range_start,
if(ipb_auto<>0,null,ipb_range_end) as ipb_range_end, ipb_enable_autoblock,
ipb_deleted, ipb_block_email, ipb_by_text, ipb_allow_usertalk,
ipb_parent_block_id",
+ "where": "ipb_deleted=0"
+ },
+ "ipblocks_ipindex": {
+ "source": "ipblocks",
+ "view": "select ipb_id, ipb_address, ipb_user, ipb_by, ipb_reason,
ipb_timestamp, ipb_auto, ipb_anon_only, ipb_create_account, ipb_expiry,
ipb_range_start, ipb_range_end, ipb_enable_autoblock, ipb_deleted,
ipb_block_email, ipb_by_text, ipb_allow_usertalk, ipb_parent_block_id",
+ "where": "ipb_deleted=0 and ipb_auto=0"
+ },
+ "logging": {
+ "source": "logging",
+ "view": "select log_id, log_type,
if(log_deleted&1,null,log_action) as log_action, log_timestamp,
if(log_deleted&4,null,log_user) as log_user,
if(log_deleted&1,null,log_namespace) as log_namespace,
if(log_deleted&1,null,log_title) as log_title,
if(log_deleted&2,null,log_comment) as log_comment,
if(log_deleted,null,log_params) as log_params, log_deleted,
if(log_deleted&4,null,log_user_text) as log_user_text,
if(log_deleted&1,null,log_page) as log_page"
+ },
+ "logging_logindex": {
+ "source": "logging",
+ "view": "select log_id, log_type, log_action, log_timestamp,
if(log_deleted&4,null,log_user) as log_user, log_namespace, log_title,
if(log_deleted&2,null,log_comment) as log_comment,
if(log_deleted,null,log_params) as log_params, log_deleted,
if(log_deleted&4,null,log_user_text) as log_user_text, log_page"
+ },
+ "logging_userindex": {
+ "source": "logging",
+ "view": "select log_id, log_type,
if(log_deleted&1,null,log_action) as log_action, log_timestamp, log_user,
if(log_deleted&1,null,log_namespace) as log_namespace,
if(log_deleted&1,null,log_title) as log_title,
if(log_deleted&2,null,log_comment) as log_comment,
if(log_deleted,null,log_params) as log_params, log_deleted, log_user_text as
log_user_text, if(log_deleted&1,null,log_page) as log_page"
+ },
+ "mark_as_helpful": {
+ "source": "mark_as_helpful",
+ "view": "select mah_id, mah_type, mah_item, mah_user_id,
mah_user_editcount, mah_namespace, mah_title, mah_timestamp, NULL as
mah_system_type, NULL as mah_user_agent, NULL as mah_locale"
+ },
+ "oldimage": {
+ "source": "oldimage",
+ "view": "select oi_name, oi_archive_name, oi_size, oi_width,
oi_height, oi_bits, if(oi_deleted&2,null,oi_description) as oi_description,
if(oi_deleted&4,null,oi_user) as oi_user, if(oi_deleted&4,null,oi_user_text) as
oi_user_text, oi_timestamp, oi_metadata, oi_media_type, oi_major_mime,
oi_minor_mime, oi_deleted, oi_sha1"
+ },
+ "oldimage_userindex": {
+ "source": "oldimage",
+ "view": "select oi_name, oi_archive_name, oi_size, oi_width,
oi_height, oi_bits, if(oi_deleted&2,null,oi_description) as oi_description,
oi_user, oi_user_text, oi_timestamp, oi_metadata, oi_media_type, oi_major_mime,
oi_minor_mime, oi_deleted, oi_sha1",
+ "where": "(oi_deleted&4)=0"
+ },
+ "recentchanges": {
+ "source": "recentchanges",
+ "view": "select rc_id, rc_timestamp, if(rc_deleted&4,null,rc_user)
as rc_user, if(rc_deleted&4,null,rc_user_text) as rc_user_text, rc_namespace,
rc_title, if(rc_deleted&2,null,rc_comment) as rc_comment, rc_minor, rc_bot,
rc_new, rc_cur_id, rc_this_oldid, rc_last_oldid, rc_type, rc_source,
rc_patrolled, null as rc_ip, rc_old_len, rc_new_len, rc_deleted, rc_logid,
rc_log_type, rc_log_action, rc_params"
+ },
+ "recentchanges_userindex": {
+ "source": "recentchanges",
+ "view": "select rc_id, rc_timestamp, rc_user, rc_user_text,
rc_namespace, rc_title, if(rc_deleted&2,null,rc_comment) as rc_comment,
rc_minor, rc_bot, rc_new, rc_cur_id, rc_this_oldid, rc_last_oldid, rc_type,
rc_source, rc_patrolled, null as rc_ip, rc_old_len, rc_new_len, rc_deleted,
rc_logid, rc_log_type, rc_log_action, rc_params",
+ "where": "(rc_deleted&4)=0"
+ },
+ "revision": {
+ "source": "revision",
+ "view": "select rev_id, rev_page,
if(rev_deleted&1,null,rev_text_id) as rev_text_id,
if(rev_deleted&2,null,rev_comment) as rev_comment,
if(rev_deleted&4,null,rev_user) as rev_user,
if(rev_deleted&4,null,rev_user_text) as rev_user_text, rev_timestamp,
rev_minor_edit, rev_deleted, if(rev_deleted&1,null,rev_len) as rev_len,
rev_parent_id, if(rev_deleted&1,null,rev_sha1) as rev_sha1, rev_content_model,
rev_content_format"
+ },
+ "revision_userindex": {
+ "source": "revision",
+ "view": "select rev_id, rev_page,
if(rev_deleted&1,null,rev_text_id) as rev_text_id,
if(rev_deleted&2,null,rev_comment) as rev_comment, rev_user, rev_user_text,
rev_timestamp, rev_minor_edit, rev_deleted, if(rev_deleted&1,null,rev_len) as
rev_len, rev_parent_id, if(rev_deleted&1,null,rev_sha1) as rev_sha1,
rev_content_model, rev_content_format",
+ "where": "(rev_deleted&4)=0"
+ },
+ "user": {
+ "source": "user",
+ "view": "select user_id, user_name, user_real_name, NULL as
user_password, NULL as user_newpassword, NULL as user_email, NULL as
user_options, NULL as user_touched, NULL as user_token, NULL as
user_email_authenticated, NULL as user_email_token, NULL as
user_email_token_expires, user_registration, NULL as user_newpass_time,
user_editcount, NULL as user_password_expires"
+ },
+ "user_old": {
+ "source": "user_old",
+ "view": "select user_id, user_name, NULL as user_password, NULL as
user_newpassword, NULL as user_email, NULL as user_options, NULL as
user_newtalk, NULL as user_touched, user_real_name, NULL as user_token"
+ },
+ "user_properties": {
+ "source": "user_properties",
+ "view": "select up_user, up_property, up_value",
+ "where": "up_property in ( 'disablemail', 'fancysig', 'gender',
'language', 'nickname', 'skin', 'timecorrection', 'variant' )"
+ },
+ "user_properties_anon": {
+ "limit": 2,
+ "source": [ "user_properties", "user",
"meta_p.properties_anon_whitelist" ],
+ "view": "select cast(extract(year_month from user_touched)*100+1
as date) upa_touched, up_property, up_value",
+ "where": "user_id=up_user and up_property like pw_property"
+ },
+ "watchlist_counts": {
+ "source": "watchlist",
+ "view" : "select count(*) as wl_count, wl_namespace, wl_title",
+ "group" : "wl_namespace, wl_title having wl_count >= 30"
+ }
+ }
+}
\ No newline at end of file
diff --git a/maintain-replicas/maintain-replicas.py
b/maintain-replicas/maintain-replicas.py
new file mode 100644
index 0000000..d4f30a7
--- /dev/null
+++ b/maintain-replicas/maintain-replicas.py
@@ -0,0 +1,154 @@
+#! /usr/bin/python3
+# -*- coding: utf-8 -*-
+
+# Based on work by Marc-André Pelletier, ported to Python by Alex Monk
+# Copyright © 2016 Alex Monk <[email protected]>
+# Copyright © 2015 Alex Monk <[email protected]>
+# Copyright © 2013 Marc-André Pelletier <[email protected]>
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+##
+## This script maintains the databases containing sanitized views to
+## the replicated databases (in the form <db>_p for every <db>)
+##
+## By default, it processes every shard but it accepts a list of
+## databases to process
+##
+## The script expects to be invoked in a fresh copy of
+## operations/mediawiki-config where it will get most of its
+## information.
+##
+
+import argparse
+import json
+import logging
+import pymysql
+import re
+import subprocess
+import sys
+
+if __name__ == "__main__":
+ argparser = argparse.ArgumentParser('maintain-replicas',
+ description='Maintain labs sanitized
views of replica databases')
+ argparser.add_argument(
+ '--config-location',
+ help='Path to find the configuration file',
+ default='config.json'
+ )
+ args = argparser.parse_args()
+ with open(args.config_location) as f:
+ config = json.load(f)
+
+ # Hacks
+ safelog = "log_type IN ('" + "', '".join(config['logging_whitelist']) +
"')"
+ customviews = config['customviews']
+ customviews['logging']['where'] = safelog
+ customviews['logging_logindex']['where'] = "(log_deleted&1)=0 and " +
safelog
+ customviews['logging_userindex']['where'] = "(log_deleted&4)=0 and " +
safelog
+
+ logging.info("Running git pull in mediawiki-config")
+ subprocess.call(["git", "pull"], cwd = "mediawiki-config")
+ dbs = {db : {} for db in
open('mediawiki-config/dblists/all.dblist').read().splitlines()}
+ def read_list(list_fname, prop, val):
+ for db in open('mediawiki-config/dblists/' + list_fname +
'.dblist').read().splitlines():
+ if db in dbs:
+ dbs[db][prop] = val
+
+ read_list("deleted", "deleted", True)
+ read_list("private", "private", True)
+ read_list("small", "size", 1)
+ read_list("medium", "size", 2)
+ read_list("large", "size", 3)
+
+ logging.info("Got all necessary info, starting to connect to slices")
+ for dbhost, dbport in config['slices']:
+ dbh = pymysql.connect(
+ host=dbhost,
+ port=dbport,
+ user=config['mysql_user'],
+ passwd=config['mysql_password'],
+ charset='utf8'
+ )
+ cursor = dbh.cursor()
+ logging.info("Connected to {}:{}...".format(dbhost, dbport))
+ cursor.execute("SET NAMES 'utf8';")
+ for db, db_info in dbs:
+ if 'deleted' in db_info and db_info['deleted']:
+ continue
+ if 'private' in db_info and db_info['private']:
+ continue
+ cursor.execute("CREATE DATABASE IF NOT EXISTS {}_p;".format(db))
+
+ logging.info("Full views for {}:".format(db))
+ for view in config['fullviews']:
+ cursor.execute("""
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_name='{}' AND table_schema='{}';
+ """.format(view, db))
+ result, = cursor.fetchone()
+ if result == 1:
+ logging.info("[{}] ".format(view))
+ cursor.execute("""
+ CREATE OR REPLACE
+ DEFINER=viewmaster
+ VIEW {0}_p.{1}
+ AS SELECT * FROM {0}.{1};
+ """.format(db, view))
+
+
+ logging.info("Custom views for {}:".format(db))
+ for view_name, view_details in customviews.items():
+ if view_details['limit'] > db_info['size']:
+ continue
+
+ sources = list(view_details['source'])
+ sources_checked = []
+
+ for source in sources:
+ match = re.match(r'^(?:(.*)\.)?([^.]+)$', source)
+ if not match:
+ # TODO: Do we really want to do this?
+ logging.critical("Custom view source does not look
valid! Source: {}, view: {}".format(source, view_name))
+ sys.exit(1)
+ source_db, source_table = match.groups()
+ if source_db is None:
+ source_db = db
+ cursor.execute("""
+ SELECT table_name
+ FROM information_schema.tables
+ WHERE table_name='{}' AND table_schema='{}';
+ """.format(table, db))
+ result, = cursor.fetchone()
+ if result == 1:
+ sources_checked.append("{}.{}".format(source_db,
source_table))
+
+ if len(sources) != len(sources_checked):
+ continue
+
+ query = """
+ CREATE OR REPLACE
+ DEFINER=viewmaster
+ VIEW ${dbk}_p.$view
+ AS {}
+ FROM {}
+ """.format(db, customview['view'], ','.join(sources))
+
+ if 'where' in view_details:
+ query += " WHERE {}\n".format(view_details['where'])
+ if 'group' in view_details:
+ query += "GROUP BY {}\n".format(view_details['group'])
+ cursor.execute("{};".format(query))
+
+logging.info("All done.")
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/295607
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Icc51fc9a23f2fcf4c19c7adc092662d7122471e1
Gerrit-PatchSet: 1
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: Alex Monk <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits