http://www.mediawiki.org/wiki/Special:Code/MediaWiki/96826
Revision: 96826
Author: ariel
Date: 2011-09-12 07:38:15 +0000 (Mon, 12 Sep 2011)
Log Message:
-----------
remove some dead fixmes, add _chunkToDo to base Dump class, seriously clean up
linking to rss feed files and removal of old latest links; these were pretty
broken after checkpoint files went in
Modified Paths:
--------------
branches/ariel/xmldumps-backup/worker.py
Modified: branches/ariel/xmldumps-backup/worker.py
===================================================================
--- branches/ariel/xmldumps-backup/worker.py 2011-09-12 02:10:11 UTC (rev
96825)
+++ branches/ariel/xmldumps-backup/worker.py 2011-09-12 07:38:15 UTC (rev
96826)
@@ -1792,6 +1792,35 @@
dumpFile = DumpFilename(self.wiki, None,
self.checksums.getChecksumFileNameBasename())
self.symLinks.saveSymlink(dumpFile)
self.symLinks.cleanupSymLinks()
+
+ for item in self.dumpItemList.dumpItems:
+ dumpNames = item.getDumpName()
+ if type(dumpNames).__name__!='list':
+ dumpNames = [ dumpNames ]
+
+ if (item._chunksEnabled):
+ # if there is a specific chunk, we want to only
clear out
+ # old files for that piece, because new files
for the other
+ # pieces may not have been generated yet.
+ chunk = item._chunkToDo
+ else:
+ chunk = None
+
+ checkpoint = None
+ if (item._checkpointsEnabled):
+ if (item.checkpointFile):
+ # if there's a specific checkpoint file
we are
+ # rerunning, we would only clear out
old copies
+ # of that very file. meh. how likely is
it that we
+ # have one? these files are time based
and the start/end pageids
+ # are going to fluctuate. whatever
+ cf = DumpFilename(self.wiki)
+ cf.newFromFilename(item.checkpointFile)
+ checkpoint = cf.checkpoint
+
+ for d in dumpNames:
+
self.symLinks.removeSymLinksFromOldRuns(self.wiki.date, d, chunk, checkpoint )
+
self.feeds.cleanupFeeds()
def makeDir(self, dir):
@@ -1826,10 +1855,15 @@
link = os.path.join(self.dumpDir.latestDir(),
latestFilename)
if exists(link) or os.path.islink(link):
if os.path.islink(link):
- realfile = os.readlink(link)
+ oldrealfile = os.readlink(link)
# format of these links should be...
../20110228/elwikidb-20110228-templatelinks.sql.gz
rellinkpattern =
re.compile('^\.\./(20[0-9]+)/')
- dateinterval = int(self.wiki.date) -
int(dumpFile.date)
+ dateinlink =
rellinkpattern.search(oldrealfile)
+ if (dateinlink):
+ dateoflinkedfile =
dateinlink.group(1)
+ dateinterval =
int(self.wiki.date) - int(dateoflinkedfile)
+ else:
+ dateinterval = 0
# no file or it's older than ours...
*then* remove the link
if not exists(os.path.realpath(link))
or dateinterval > 0:
self.debugfn("Removing old
symlink %s" % link)
@@ -1854,6 +1888,34 @@
if not
exists(os.path.join(latestDir,realfile)):
os.remove(link)
+ # if the args are False or None, we remove all the old links for all
values of the arg.
+ # example: if chunk is False or None then we remove all old values for
all chunks
+ # "old" means "older than the specified datestring".
+ def removeSymLinksFromOldRuns(self, dateString, dumpName=None,
chunk=None, checkpoint=None):
+ # fixme this needs to do more work if there are chunks or
checkpoint files linked in here from
+ # earlier dates. checkpoint ranges change, and configuration of
chunks changes too, so maybe
+ # old files still exist and the links need to be removed
because we have newer files for the
+ # same phase of the dump.
+ if (self._enabled):
+ latestDir = self.dumpDir.latestDir()
+ files = os.listdir(latestDir)
+ for f in files:
+ link = os.path.join(latestDir,f)
+ if os.path.islink(link):
+ realfile = os.readlink(link)
+ fileObj =
DumpFilename(self.dumpDir._wiki)
+
fileObj.newFromFilename(os.path.basename(realfile))
+ if fileObj.date < dateString:
+ # fixme check that these are ok
if the value is None
+ if dumpName and
(fileObj.dumpName != dumpName):
+ continue
+ if chunk and (fileObj.chunk !=
chunk):
+ continue
+ if checkpoint and
(fileObj.checkpoint != checkpoint):
+ continue
+ self.debugfn("Removing old
symlink %s -> %s" % (link, realfile))
+ os.remove(link)
+
class Feeds(object):
def __init__(self, wiki, dumpDir, dbName, debugfn, enabled):
self.wiki = wiki
@@ -1884,7 +1946,8 @@
"description": xmlEscape("<a
href=\"%s\">%s</a>" % (filenameAndPath, fileObj.filename)),
"date": time.strftime("%a, %d %b %Y %H:%M:%S
GMT", time.gmtime()) }
directory = self.dumpDir.latestDir()
- rssPath = os.path.join(self.dumpDir.latestDir(),
fileObj.basename + "-rss.xml")
+ rssPath = os.path.join(self.dumpDir.latestDir(),
self.dbName + "-latest-" + fileObj.basename + "-rss.xml")
+ self.debugfn( "adding rss feed file %s " % rssPath )
FileUtils.writeFile(self.wiki.config.tempDir, rssPath,
rssText, self.wiki.config.fileperms)
def cleanupFeeds(self):
@@ -1896,9 +1959,10 @@
files = os.listdir(latestDir)
for f in files:
if f.endswith("-rss.xml"):
- filename = f[:-8];
- link = os.path.join(latestDir,f)
+ filename = f[:-8]
+ link = os.path.join(latestDir,filename)
if not exists(link):
+ self.debugfn("Removing old rss
feed %s for link %s" % (os.path.join(latestDir,f), link))
os.remove(os.path.join(latestDir,f))
class Dump(object):
@@ -1919,6 +1983,8 @@
self._checkpointsEnabled = False
if not hasattr(self, 'checkpointFile'):
self.checkpointFile = False
+ if not hasattr(self, '_chunkToDo'):
+ self._chunkToDo = False
def name(self):
return self.runInfo.name()
@@ -1996,7 +2062,6 @@
runner.log.addToLogQueue(line)
sys.stderr.write(line)
self.progress = line.strip()
- # FIXME test this a lot!! does the updateStatus work?
runner.status.updateStatusFiles()
runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.reportDumpRunInfo())
@@ -2392,7 +2457,6 @@
files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir,
self.getChunkList(), dumpNames))
files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(),
dumpNames))
else:
- # fixme this should be a list
# we will pass list of chunks or chunkToDo, or False,
depending on the job setup.
files.extend(self.listRegularFilesPerChunkExisting(dumpDir,
self.getChunkList(), dumpNames))
return files
@@ -2532,7 +2596,6 @@
return files
def listOutputFilesForCleanup(self, dumpDir):
- # fixme should this pass a list instead of one item?
dumpNames = self.listDumpNames()
files = []
files.extend(Dump.listOutputFilesForCleanup(self, dumpDir,
dumpNames))
@@ -2549,7 +2612,6 @@
if (not exists( runner.wiki.config.php ) ):
raise BackupError("php command %s not found" %
runner.wiki.config.php)
- # fixme we have a list of all the files for all three
dumpNames, we want to split them up by dumpName. oops.
articlesFile = runner.dumpDir.filenamePublicPath(f)
historyFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
currentFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
@@ -2724,7 +2786,6 @@
self.cleanupOldFiles(runner.dumpDir)
# just get the files pertaining to our dumpName, which is *one*
of articles, pages-current, pages-history.
# stubs include all of them together.
- # FIXME this needs some other jobname here. uuuggghhh and how
would this job know what that is? bah
if not self.dumpName.startswith(self.getDumpNameBase()):
raise BackupError("dumpName %s of unknown form for this
job" % self.dumpName)
dumpName = self.dumpName[len(self.getDumpNameBase()):]
@@ -2822,7 +2883,6 @@
if (self.checkpointFile):
outputFile = f
- print "outputFile is ", outputFile.filename
elif (self._checkpointsEnabled):
# we write a temp file, it will be checkpointed every
so often.
outputFile = DumpFilename(self.wiki, f.date,
self.dumpName, f.fileType, self.fileExt, f.chunk, f.checkpoint, temp = True)
@@ -2935,11 +2995,10 @@
for fileObj in fileList:
firstPageIdInFile = int(fileObj.firstPageID)
- # fixme what do we do here? this could be very
expensive. is that
- # worth it??
+ # fixme what do we do here? this could be very
expensive. is that worth it??
if not fileObj.lastPageID:
# (b) nasty hack, see (a)
- # it's not a chekcpoint fle or we'd
have the pageid in the filename
+ # it's not a checkpoint fle or we'd
have the pageid in the filename
# so... temporary hack which will give
expensive results
# if chunk file, and it's the last
chunk, put none
# if it's not the last chunk, get the
first pageid in the next chunk and subtract 1
@@ -3184,7 +3243,6 @@
files.extend(self.listCheckpointFilesPerChunkExisting(dumpDir,
self.getChunkList(), dumpNames))
files.extend(self.listTempFilesPerChunkExisting(dumpDir, self.getChunkList(),
dumpNames))
else:
- # fixme this should be a list
# we will pass list of chunks or chunkToDo, or False,
depending on the job setup.
files.extend(self.listRegularFilesPerChunkExisting(dumpDir,
self.getChunkList(), dumpNames))
return files
@@ -3264,7 +3322,6 @@
return "xml"
def getFileExt(self):
- # fixme no file extension, see what this means for everything
return ""
def buildCommand(self, runner, f):
@@ -3396,7 +3453,6 @@
def run(self, runner):
error = 0
- # FIXME check this code
files =
self.itemForRecombine.listOutputFilesForInput(runner.dumpDir)
outputFileList =
self.listOutputFilesForBuildCommand(runner.dumpDir)
for outputFile in outputFileList:
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs