ArielGlenn has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/394763 )
Change subject: simplify cleanup of old xml/sql dumps
......................................................................
simplify cleanup of old xml/sql dumps
Don't bother retrieving a list of all wikis, just walk through all
the immediate subdirs of the dumps tree and process those with
dirs that look like dump runs (names YYYYMMDD)
Since there's no all.dblist any more, have a default entry
in the conf file for the number of dumps to keep
Bug: T181895
Change-Id: Ic62c381f087c4c018b261f85b8b6db81a3f15ea0
---
M modules/dumps/files/web/cleanups/cleanup_old_xmldumps.py
R modules/dumps/manifests/web/cleanups/xmldumps.pp
M modules/dumps/manifests/web/xmldumps_active.pp
3 files changed, 64 insertions(+), 44 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/modules/dumps/files/web/cleanups/cleanup_old_xmldumps.py
b/modules/dumps/files/web/cleanups/cleanup_old_xmldumps.py
index b58294d..b268257 100644
--- a/modules/dumps/files/web/cleanups/cleanup_old_xmldumps.py
+++ b/modules/dumps/files/web/cleanups/cleanup_old_xmldumps.py
@@ -11,6 +11,7 @@
import sys
import getopt
import shutil
+import re
def usage(message=None):
@@ -24,13 +25,13 @@
sys.stderr.write("\n")
usage_message = """
Usage: cleanup_old_xmldumps.py --keeps_conffile path --wikilists dir
- [--help]
+ [--subdirs] [--dryrun] [--help]
Given a directory with files with lists of wikis,
settings in a file describing how many dumps to keep
for wikis in each list, and the path of the directory
tree of the dumps, looks through each directory
-treepath/wiki/ to be sure that there are no more than
+treepath/wikiname/ to be sure that there are no more than
the specified number of dump directories. Dump directories
are subdirectories with the format YYYYMMDD; the rest are
ignored.
@@ -45,6 +46,12 @@
dumps per wiki we keep, for each file
containing a list of wikis
--wiki (-w): wiki for which to dump config settings
+    --subdirs (-s): directories in treepath must
+                    match this regular expression in order to
+                    be examined and cleaned up.
+                    default: '[a-z0-9]*wik[a-z0-9]*'
+ --dryrun (-D): don't remove anything, display what
+ would be removed
--help (-h): display this usage message
File formats:
@@ -56,7 +63,8 @@
dumps to keep. Example: enwiki:3
Note that blank lines or lines starting with '#' in both types
of files will be skipped.
-
+ An entry 'default:number' will be the value used for any wikis
+ not in one of the specified lists.
Example:
cleanup_old_xmldumps.py -k /etc/dumps/xml_keeps.conf \
-d /mnt/dumpsdata/xmldatadumps/public -w /etc/dumps/dblists
@@ -65,7 +73,7 @@
sys.exit(1)
-def get_wikilists(wikilists_dir):
+def get_wikilists(wikilists_dir, knownlists):
"""
read lists of wikis from files in a specified dir,
skipping comments and blank lines, return dict of
@@ -74,10 +82,22 @@
wikilists = {}
files = os.listdir(wikilists_dir)
for filename in files:
- with open(os.path.join(wikilists_dir, filename), "r") as fhandle:
- lines = fhandle.readlines()
- wikilists[filename] = [line.strip() for line in lines if line and not
line.startswith('#')]
+ if filename in knownlists:
+ with open(os.path.join(wikilists_dir, filename), "r") as fhandle:
+ lines = fhandle.readlines()
+ wikilists[filename] = [line.strip() for line in lines
+ if line and not line.startswith('#')]
return wikilists
+
+
+def get_allwikis(dumpsdir, match):
+ """
+ return list of all subdirectories of dumpsdir matching the
+ supplied regular expression
+ """
+ subdirs = os.listdir(dumpsdir)
+ return [subdir for subdir in subdirs
+ if re.match(match, subdir) and
os.path.isdir(os.path.join(dumpsdir, subdir))]
def get_keeps(keeps_conffile):
@@ -109,21 +129,37 @@
"""
methods for finding and cleaning up old wiki dump dirs
"""
- def __init__(self, keeps_conffile, wikilists_dir, dumpsdir, dryrun):
+ def __init__(self, keeps_conffile, wikilists_dir, dumpsdir, wikipattern,
dryrun):
self.keeps_per_list = get_keeps(keeps_conffile)
- self.wikilists = get_wikilists(wikilists_dir)
+ self.wikilists = get_wikilists(wikilists_dir,
self.keeps_per_list.keys())
self.dumpsdir = dumpsdir
+ self.wikistoclean = get_allwikis(self.dumpsdir, wikipattern)
self.dryrun = dryrun
def get_dumps(self, wiki):
"""
get list of subdirs for dumpsdir/wiki/ in format YYYYMMDD
+ all other subdirs are skipped.
"""
path = os.path.join(self.dumpsdir, wiki)
if not os.path.exists(path):
return []
dirs = os.listdir(path)
return sorted([dirname for dirname in dirs if dirname.isdigit() and
len(dirname) == 8])
+
+ def get_keep_for_wiki(self, wiki):
+ """
+ find the keep value for the specified wiki
+ by checking through the various keep conf settings,
+ falling back to the default setting, if there is one
+ if no setting is found, return None
+ """
+ for wikilist in self.wikilists:
+ if wiki in self.wikilists[wikilist]:
+ return self.keeps_per_list[wikilist]
+ if 'default' in self.keeps_per_list:
+ return self.keeps_per_list['default']
+ return None
def cleanup_dirs(self, wiki, dirs):
"""
@@ -137,27 +173,26 @@
else:
shutil.rmtree("%s" % to_remove)
- def cleanup_wikis(self, keeps, wikis):
+ def cleanup_wiki(self, keeps, wiki):
"""
- for each wiki in the list, remove oldest dumps if we have
- more than the number to keep
+ remove oldest dumps if we have more than the number to keep
"""
- for wiki in wikis:
- dumps = self.get_dumps(wiki)
- if len(dumps) > int(keeps):
- self.cleanup_dirs(wiki, dumps[0:len(dumps) - int(keeps)])
+ dumps = self.get_dumps(wiki)
+ if len(dumps) > int(keeps):
+ self.cleanup_dirs(wiki, dumps[0:len(dumps) - int(keeps)])
def clean(self):
"""
remove oldest dumps from each wiki directory in dumpsdir
if we are keeping too many, as determined by the conffile
describing how many we keep for wikis in each list in the
- wikilists_dir.
+ wikilists_dir, with optional default keep value.
"""
- for listname in self.wikilists:
- if listname not in self.keeps_per_list:
+ for wiki in self.wikistoclean:
+ tokeep = self.get_keep_for_wiki(wiki)
+ if tokeep is None:
continue
- self.cleanup_wikis(self.keeps_per_list[listname],
self.wikilists[listname])
+ self.cleanup_wiki(tokeep, wiki)
def check_args(args, remainder):
@@ -177,13 +212,14 @@
keeps_conffile = None
dumpsdir = None
+ subdirs = '[a-z0-9]*wik[a-z0-9]*'
wikilists_dir = None
dryrun = False
try:
(options, remainder) = getopt.gnu_getopt(
- sys.argv[1:], "d:k:w:Dh", ["dumpsdir=", "keep=", "wikilists=",
- "dryrun", "help"])
+ sys.argv[1:], "d:k:s:w:Dh",
+ ["dumpsdir=", "keep=", "subdirs=", "wikilists=", "dryrun", "help"])
except getopt.GetoptError as err:
usage("Unknown option specified: " + str(err))
@@ -193,6 +229,8 @@
dumpsdir = val
elif opt in ["-k", "--keep"]:
keeps_conffile = val
+ elif opt in ["-s", "--subdirs"]:
+ subdirs = val
elif opt in ["-w", "--wikilists"]:
wikilists_dir = val
elif opt in ["-D", "--dryrun"]:
@@ -208,7 +246,7 @@
if not os.path.exists(keeps_conffile):
usage("no such file found: " + keeps_conffile)
- cleaner = DumpsCleaner(keeps_conffile, wikilists_dir, dumpsdir, dryrun)
+ cleaner = DumpsCleaner(keeps_conffile, wikilists_dir, dumpsdir, subdirs,
dryrun)
cleaner.clean()
diff --git a/modules/dumps/manifests/web/cleanups/xml_cleanup.pp
b/modules/dumps/manifests/web/cleanups/xmldumps.pp
similarity index 74%
rename from modules/dumps/manifests/web/cleanups/xml_cleanup.pp
rename to modules/dumps/manifests/web/cleanups/xmldumps.pp
index 1403004..9784525 100644
--- a/modules/dumps/manifests/web/cleanups/xml_cleanup.pp
+++ b/modules/dumps/manifests/web/cleanups/xmldumps.pp
@@ -1,4 +1,4 @@
-class dumps::web::cleanups::xml_cleanup(
+class dumps::web::cleanups::xmldumps(
$wikilist_url = undef,
$publicdir = undef,
$user = undef,
@@ -44,9 +44,7 @@
# less, so that when a new dump run starts and partial dumps are
# copied over to the web server, space is available for that new
# run BEFORE it is copied.
- # Some wikis in the allwikis list may be also in one of the
- # other lists. That's fine, the smaller number to keep always wins.
- $keeps = ['hugewikis.dblist:7', 'bigwikis.dblist:8', 'allwikis.dblist:10']
+ $keeps = ['hugewikis.dblist:7', 'bigwikis.dblist:8', 'default:10']
$keeps_content = join($keeps, "\n")
file { '/etc/dumps/xml_keeps.conf':
@@ -56,22 +54,6 @@
owner => 'root',
group => 'root',
content => "${keeps_content}\n",
- }
-
- # get and save the list of all wikis.
- # private or nonexistent wikis can be skipped by the cleanup script so
- # we don't filter the list here.
- $curl_command = '/usr/bin/curl --connect-timeout 5 -s --retry 5
--retry-delay 10'
- $curl_output = "${wikilist_dir}/allwikis.dblist"
- $curl_args = "-z ${curl_output} -o ${curl_output} '${wikilist_url}'"
-
- cron { 'get_wiki_list':
- ensure => 'present',
- environment => '[email protected]',
- command => "${curl_command} ${curl_args}",
- user => $user,
- minute => '20',
- hour => '1',
}
file { '/usr/local/bin/cleanup_old_xmldumps.py':
diff --git a/modules/dumps/manifests/web/xmldumps_active.pp
b/modules/dumps/manifests/web/xmldumps_active.pp
index 75fb517..82ccc48 100644
--- a/modules/dumps/manifests/web/xmldumps_active.pp
+++ b/modules/dumps/manifests/web/xmldumps_active.pp
@@ -31,7 +31,7 @@
# only the active web server needs to cleanup old files
# rsync between peers will take care of the other hosts
- class {'::dumps::web::cleanups::xml_cleanup':
+ class {'::dumps::web::cleanups::xmldumps':
wikilist_url => $wikilist_url,
publicdir => $publicdir,
user => $user,
--
To view, visit https://gerrit.wikimedia.org/r/394763
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ic62c381f087c4c018b261f85b8b6db81a3f15ea0
Gerrit-PatchSet: 5
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: Volans <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits