ArielGlenn has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/242464
Change subject: utils.py: pylint, fix many camelcase names. worker.py, fix indent issue
......................................................................

utils.py: pylint, fix many camelcase names. worker.py, fix indent issue

Change-Id: I9c5104fa802d4bf1b85c4c61bd6f6419fa7e04f1
---
M xmldumps-backup/dumps/jobs.py
M xmldumps-backup/dumps/utils.py
M xmldumps-backup/worker.py
M xmldumps-backup/xmlstreams.py
4 files changed, 220 insertions(+), 220 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/64/242464/1

diff --git a/xmldumps-backup/dumps/jobs.py b/xmldumps-backup/dumps/jobs.py
index 48c5e5a..75f599c 100644
--- a/xmldumps-backup/dumps/jobs.py
+++ b/xmldumps-backup/dumps/jobs.py
@@ -59,18 +59,18 @@
     def updated(self):
         return self.runInfo.updated()
 
-    def toBeRun(self):
-        return self.runInfo.toBeRun()
+    def to_run(self):
+        return self.runInfo.to_run()
 
     def setName(self, name):
         self.runInfo.setName(name)
 
-    def setToBeRun(self, toBeRun):
-        self.runInfo.setToBeRun(toBeRun)
+    def set_to_run(self, to_run):
+        self.runInfo.set_to_run(to_run)
 
     def setSkipped(self):
         self.setStatus("skipped")
-        self.setToBeRun(False)
+        self.set_to_run(False)
 
     # sometimes this will be called to fill in data from an old
     # dump run; in those cases we don't want to clobber the timestamp
@@ -189,7 +189,7 @@
             sys.stderr.write(line)
             self.progress = line.strip()
             runner.status.update_status_files()
-            runner.runInfoFile.saveDumpRunInfoFile(runner.dumpItemList.report_dump_runinfo())
+            runner.runInfoFile.save_dump_runinfo_file(runner.dumpItemList.report_dump_runinfo())
 
     def timeToWait(self):
         # we use wait this many secs for a command to complete that
@@ -650,7 +650,7 @@
         """Dump a table from the current DB with mysqldump, save to a gzipped sql file."""
         if not exists(runner.wiki.config.gzip):
             raise BackupError("gzip command %s not found" % runner.wiki.config.gzip)
-        commands = runner.dbServerInfo.buildSqlDumpCommand(table, runner.wiki.config.gzip)
+        commands = runner.dbServerInfo.build_sqldump_command(table, runner.wiki.config.gzip)
         return runner.save_command(commands, outfile)
 
 class PrivateTable(PublicTable):
@@ -1228,7 +1228,7 @@
                 continue
 
             # see if this job from that date was successful
-            if not runner.runInfoFile.statusOfOldDumpIsDone(runner, date, self.name(), self._desc):
+            if not runner.runInfoFile.status_of_old_dump_is_done(runner, date, self.name(), self._desc):
                 runner.debug("skipping incomplete or failed dump for prefetch date %s" % date)
                 continue
 
diff --git a/xmldumps-backup/dumps/utils.py b/xmldumps-backup/dumps/utils.py
index 02d020e..0bfbc40 100644
--- a/xmldumps-backup/dumps/utils.py
+++ b/xmldumps-backup/dumps/utils.py
@@ -12,21 +12,21 @@
 from dumps.exceptions import BackupError
 
 class MultiVersion(object):
-    def MWScriptAsString(config, maintenanceScript):
-        return " ".join(MultiVersion.MWScriptAsArray(config, maintenanceScript))
+    def MWScriptAsString(config, maintenance_script):
+        return " ".join(MultiVersion.MWScriptAsArray(config, maintenance_script))
 
-    def MWScriptAsArray(config, maintenanceScript):
-        MWScriptLocation = os.path.join(config.wikiDir, "multiversion", "MWScript.php")
-        if exists(MWScriptLocation):
-            return [MWScriptLocation, maintenanceScript]
+    def MWScriptAsArray(config, maintenance_script):
+        mw_script_location = os.path.join(config.wikiDir, "multiversion", "MWScript.php")
+        if exists(mw_script_location):
+            return [mw_script_location, maintenance_script]
         else:
-            return ["%s/maintenance/%s" % (config.wikiDir, maintenanceScript)]
+            return ["%s/maintenance/%s" % (config.wikiDir, maintenance_script)]
 
-    def MWVersion(config, dbName):
-        getVersionLocation = os.path.join(config.wikiDir, "multiversion", "getMWVersion")
-        if exists(getVersionLocation):
+    def MWVersion(config, db_name):
+        get_version_location = os.path.join(config.wikiDir, "multiversion", "getMWVersion")
+        if exists(get_version_location):
             # run the command for the wiki and get the version
-            command = getVersionLocation + " " + dbName
+            command = get_version_location + " " + db_name
             version = RunSimpleCommand.runAndReturn(command)
             if version:
                 version = version.rstrip()
@@ -38,57 +38,57 @@
     MWVersion = staticmethod(MWVersion)
 
 class DbServerInfo(object):
-    def __init__(self, wiki, dbName, errorCallback=None):
+    def __init__(self, wiki, db_name, error_callback=None):
         self.wiki = wiki
-        self.dbName = dbName
-        self.errorCallback = errorCallback
-        self.dBTablePrefix = None
-        self.getDefaultServerAndDBprefix()
+        self.dbName = db_name
+        self.errorCallback = error_callback
+        self.db_table_prefix = None
+        self.get_db_server_and_prefix()
 
-    def getDefaultServerAndDBprefix(self):
+    def get_db_server_and_prefix(self):
         """Get the name of a slave server for our cluster; also get
         the prefix for all tables for the specific wiki ($wgDBprefix)"""
         if not exists(self.wiki.config.php):
             raise BackupError("php command %s not found" % self.wiki.config.php)
-        commandList = MultiVersion.MWScriptAsArray(self.wiki.config, "getSlaveServer.php")
-        phpCommand = MiscUtils.shellEscape(self.wiki.config.php)
-        dbName = MiscUtils.shellEscape(self.dbName)
-        for i in range(0, len(commandList)):
-            commandList[i] = MiscUtils.shellEscape(commandList[i])
-        command = " ".join(commandList)
-        command = "%s -q %s --wiki=%s --group=dump --globals" % (phpCommand, command, dbName)
+        command_list = MultiVersion.MWScriptAsArray(self.wiki.config, "getSlaveServer.php")
+        php_command = MiscUtils.shellEscape(self.wiki.config.php)
+        db_name = MiscUtils.shellEscape(self.dbName)
+        for i in range(0, len(command_list)):
+            command_list[i] = MiscUtils.shellEscape(command_list[i])
+        command = " ".join(command_list)
+        command = "%s -q %s --wiki=%s --group=dump --globals" % (php_command, command, db_name)
         results = RunSimpleCommand.runAndReturn(command, self.errorCallback).strip()
         if not results:
             raise BackupError("Failed to get database connection information for %s, bailing." % self.wiki.config.php)
         # first line is the server, the second is an array of the globals, we need the db table prefix out of those
         lines = results.splitlines()
         self.dbServer = lines[0]
-        self.dbPort = None
+        self.db_port = None
         if ':' in self.dbServer:
-            self.dbServer, _, self.dbPort = self.dbServer.rpartition(':')
+            self.dbServer, _, self.db_port = self.dbServer.rpartition(':')
 
         # [wgDBprefix] =>
-        wgdbprefixPattern = re.compile("\s+\[wgDBprefix\]\s+=>\s+(?P<prefix>.*)$")
-        for l in lines:
-            match = wgdbprefixPattern.match(l)
+        wgdb_prefix_pattern = re.compile("\s+\[wgDBprefix\]\s+=>\s+(?P<prefix>.*)$")
+        for line in lines:
+            match = wgdb_prefix_pattern.match(line)
             if match:
-                self.dBTablePrefix = match.group('prefix').strip()
-        if self.dBTablePrefix == None:
+                self.db_table_prefix = match.group('prefix').strip()
+        if self.db_table_prefix == None:
             # if we didn't see this in the globals list, something is broken.
             raise BackupError("Failed to get database table prefix for %s, bailing." % self.wiki.config.php)
 
-    def mysqlStandardParameters(self):
+    def mysql_standard_parameters(self):
         host = self.dbServer
-        if self.dbPort and self.dbServer.strip() == "localhost":
+        if self.db_port and self.dbServer.strip() == "localhost":
             # MySQL tools ignore port settings for host "localhost" and instead use IPC sockets,
             # so we rewrite the localhost to it's ip address
             host = socket.gethostbyname(self.dbServer);
 
         params = ["-h", "%s" % host]  # Host
-        if self.dbPort:
-            params += ["--port", "%s" % self.dbPort]  # Port
+        if self.db_port:
+            params += ["--port", "%s" % self.db_port]  # Port
         params += ["-u", "%s" % self.wiki.config.dbUser]  # Username
-        params += ["%s" % self.passwordOption()]  # Password
+        params += ["%s" % self.password_option()]  # Password
         return params
 
     def buildSqlCommand(self, query, pipeto=None):
@@ -96,38 +96,38 @@
         if not exists(self.wiki.config.mysql):
             raise BackupError("mysql command %s not found" % self.wiki.config.mysql)
         command = [["/bin/echo", "%s" % query],
-                   ["%s" % self.wiki.config.mysql] + self.mysqlStandardParameters() + [
-                   "%s" % self.dbName,
-                   "-r"]]
+                   ["%s" % self.wiki.config.mysql] + self.mysql_standard_parameters() + [
+                       "%s" % self.dbName,
+                       "-r"]]
         if pipeto:
             command.append([pipeto])
         return command
 
-    def buildSqlDumpCommand(self, table, pipeto=None):
+    def build_sqldump_command(self, table, pipeto=None):
         """Put together a command to dump a table from the current DB with mysqldump
         and save to a gzipped sql file."""
         if not exists(self.wiki.config.mysqldump):
             raise BackupError("mysqldump command %s not found" % self.wiki.config.mysqldump)
-        command = [["%s" % self.wiki.config.mysqldump] + self.mysqlStandardParameters() + [
-                   "--opt", "--quick",
-                   "--skip-add-locks", "--skip-lock-tables",
-                   "%s" % self.dbName,
-                   "%s" % self.dBTablePrefix + table]]
+        command = [["%s" % self.wiki.config.mysqldump] + self.mysql_standard_parameters() + [
+                       "--opt", "--quick",
+                       "--skip-add-locks", "--skip-lock-tables",
+                       "%s" % self.dbName,
+                       "%s" % self.db_table_prefix + table]]
         if pipeto:
             command.append([pipeto])
         return command
 
-    def runSqlAndGetOutput(self, query):
+    def run_sql_and_get_output(self, query):
         command = self.buildSqlCommand(query)
-        p = CommandPipeline(command, quiet=True)
-        p.runPipelineAndGetOutput()
+        proc = CommandPipeline(command, quiet=True)
+        proc.runPipelineAndGetOutput()
         # fixme best to put the return code someplace along with any errors....
-        if p.exitedSuccessfully() and (p.output()):
-            return p.output()
+        if proc.exitedSuccessfully() and (proc.output()):
+            return proc.output()
         else:
             return None
 
-    def passwordOption(self):
+    def password_option(self):
         """If you pass '-pfoo' mysql uses the password 'foo',
         but if you pass '-p' it prompts. Sigh."""
         if self.wiki.config.dbPassword == "":
@@ -136,7 +136,7 @@
             return "-p" + self.wiki.config.dbPassword
 
 class RunSimpleCommand(object):
-    def runAndReturn(command, logCallback=None):
+    def runAndReturn(command, log_callback=None):
         """Run a command and return the output as a string.
         Raises BackupError on non-zero return code."""
         retval = 1
@@ -146,16 +146,16 @@
             output, error = proc.communicate()
             retval = proc.returncode
         while retval and retries < maxretries:
-            if logCallback:
-                logCallback("Non-zero return code from '%s'" % command)
+            if log_callback:
+                log_callback("Non-zero return code from '%s'" % command)
             time.sleep(5)
             proc = Popen(command, bufsize=64, shell=True, stdout=PIPE, stderr=PIPE)
             output, error = proc.communicate()
             retval = proc.returncode
             retries = retries + 1
         if retval:
-            if logCallback:
-                logCallback("Non-zero return code from '%s'" % command)
+            if log_callback:
+                log_callback("Non-zero return code from '%s'" % command)
             raise BackupError("Non-zero return code from '%s'" % command)
         else:
             return output
@@ -163,53 +163,53 @@
     runAndReturn = staticmethod(runAndReturn)
 
 class PageAndEditStats(object):
-    def __init__(self, wiki, dbName, errorCallback=None):
-        self.totalPages = None
-        self.totalEdits = None
+    def __init__(self, wiki, db_name, error_callback=None):
+        self.total_pages = None
+        self.total_edits = None
         self.wiki = wiki
-        self.dbName = dbName
-        self.dbServerInfo = DbServerInfo(wiki, dbName, errorCallback)
-        self.getStatistics(self.wiki.config, dbName)
+        self.dbName = db_name
+        self.dbServerInfo = DbServerInfo(wiki, db_name, error_callback)
+        self.get_statistics(self.wiki.config, db_name)
 
-    def getStatistics(self, dbName, ignore):
+    def get_statistics(self, db_name, ignore):
         """Get statistics for the wiki"""
-        query = "select MAX(page_id) from %spage;" % self.dbServerInfo.dBTablePrefix
+        query = "select MAX(page_id) from %spage;" % self.dbServerInfo.db_table_prefix
         results = None
         retries = 0
         maxretries = 5
-        results = self.dbServerInfo.runSqlAndGetOutput(query)
+        results = self.dbServerInfo.run_sql_and_get_output(query)
         while results == None and retries < maxretries:
             retries = retries + 1
             time.sleep(5)
-            results = self.dbServerInfo.runSqlAndGetOutput(query)
+            results = self.dbServerInfo.run_sql_and_get_output(query)
         if not results:
             return 1
 
         lines = results.splitlines()
         if lines and lines[1]:
-            self.totalPages = int(lines[1])
-        query = "select MAX(rev_id) from %srevision;" % self.dbServerInfo.dBTablePrefix
+            self.total_pages = int(lines[1])
+        query = "select MAX(rev_id) from %srevision;" % self.dbServerInfo.db_table_prefix
         retries = 0
         results = None
-        results = self.dbServerInfo.runSqlAndGetOutput(query)
+        results = self.dbServerInfo.run_sql_and_get_output(query)
         while results == None and retries < maxretries:
             retries = retries + 1
             time.sleep(5)
-            results = self.dbServerInfo.runSqlAndGetOutput(query)
+            results = self.dbServerInfo.run_sql_and_get_output(query)
         if not results:
             return 1
 
         lines = results.splitlines()
         if lines and lines[1]:
-            self.totalEdits = int(lines[1])
+            self.total_edits = int(lines[1])
         return 0
 
-    def getTotalPages(self):
-        return self.totalPages
+    def get_total_pages(self):
+        return self.total_pages
 
-    def getTotalEdits(self):
-        return self.totalEdits
+    def get_total_edits(self):
+        return self.total_edits
 
 
 class RunInfoFile(object):
@@ -218,20 +218,20 @@
         self._enabled = enabled
         self.verbose = verbose
 
-    def saveDumpRunInfoFile(self, text):
+    def save_dump_runinfo_file(self, text):
         """Write out a simple text file with the status for this wiki's dump."""
         if self._enabled:
             try:
-                self._writeDumpRunInfoFile(text)
+                self._write_dump_runinfo_file(text)
             except:
                 if self.verbose:
                     exc_type, exc_value, exc_traceback = sys.exc_info()
                     sys.stderr.write(repr(traceback.format_exception(exc_type, exc_value, exc_traceback)))
                 sys.stderr.write("Couldn't save dump run info file. Continuing anyways\n")
 
-    def statusOfOldDumpIsDone(self, runner, date, jobName, jobDesc):
-        oldDumpRunInfoFilename=self._getDumpRunInfoFileName(date)
-        status = self._getStatusForJobFromRunInfoFile(oldDumpRunInfoFilename, jobName)
+    def status_of_old_dump_is_done(self, runner, date, job_name, job_desc):
+        old_dump_runinfo_filename=self._get_dump_runinfo_filename(date)
+        status = self._get_job_status_from_runinfo(old_dump_runinfo_filename, job_name)
         if status == "done":
             return 1
         elif not status == None:
@@ -239,26 +239,26 @@
             return 0
 
         # ok, there was no info there to be had, try the index file. yuck.
-        indexFilename = os.path.join(runner.wiki.publicDir(), date, runner.wiki.config.perDumpIndex)
-        status = self._getStatusForJobFromIndexFile(indexFilename, jobDesc)
+        index_filename = os.path.join(runner.wiki.publicDir(), date, runner.wiki.config.perDumpIndex)
+        status = self._get_job_status_from_html(index_filename, job_desc)
         if status == "done":
             return 1
         else:
             return 0
 
-    def getOldRunInfoFromFile(self):
+    def get_old_runinfo_from_file(self):
         # read the dump run info file in, if there is one, and get info about which dumps
         # have already been run and whether they were successful
-        dumpRunInfoFileName = self._getDumpRunInfoFileName()
+        dump_runinfo_filename = self._get_dump_runinfo_filename()
         results = []
 
-        if not os.path.exists(dumpRunInfoFileName):
+        if not os.path.exists(dump_runinfo_filename):
             return False
 
         try:
-            infile = open(dumpRunInfoFileName, "r")
+            infile = open(dump_runinfo_filename, "r")
             for line in infile:
-                results.append(self._getOldRunInfoFromLine(line))
+                results.append(self._get_old_runinfo_from_line(line))
             infile.close
             return results
         except:
@@ -270,7 +270,7 @@
     #
     # functions internal to the class
     #
-    def _getDumpRunInfoFileName(self, date=None):
+    def _get_dump_runinfo_filename(self, date=None):
         # sometimes need to get this info for an older run to check status of a file for
         # possible prefetch
         if date:
@@ -278,51 +278,51 @@
         else:
             return os.path.join(self.wiki.publicDir(), self.wiki.date, "dumpruninfo.txt")
 
-    def _getDumpRunInfoDirName(self, date=None):
+    def _get_dump_runinfo_dirname(self, date=None):
         if date:
             return os.path.join(self.wiki.publicDir(), date)
         else:
             return os.path.join(self.wiki.publicDir(), self.wiki.date)
 
     # format: name:%; updated:%; status:%
-    def _getOldRunInfoFromLine(self, line):
+    def _get_old_runinfo_from_line(self, line):
         # get rid of leading/trailing/blanks
         line = line.strip(" ")
         line = line.replace("\n", "")
         fields = line.split(';', 2)
-        dumpRunInfo = RunInfo()
+        dump_runinfo = RunInfo()
         for field in fields:
             field = field.strip(" ")
-            (fieldName, separator, fieldValue) = field.partition(':')
-            if fieldName == "name":
-                dumpRunInfo.setName(fieldValue)
-            elif fieldName == "status":
-                dumpRunInfo.setStatus(fieldValue, False)
-            elif fieldName == "updated":
-                dumpRunInfo.setUpdated(fieldValue)
-        return dumpRunInfo
+            (fieldname, separator, field_value) = field.partition(':')
+            if fieldname == "name":
+                dump_runinfo.setName(field_value)
+            elif fieldname == "status":
+                dump_runinfo.setStatus(field_value, False)
+            elif fieldname == "updated":
+                dump_runinfo.setUpdated(field_value)
+        return dump_runinfo
 
-    def _writeDumpRunInfoFile(self, text):
-        directory = self._getDumpRunInfoDirName()
-        dumpRunInfoFilename = self._getDumpRunInfoFileName()
+    def _write_dump_runinfo_file(self, text):
+        directory = self._get_dump_runinfo_dirname()
+        dump_runinfo_filename = self._get_dump_runinfo_filename()
         # FileUtils.writeFile(directory, dumpRunInfoFilename, text, self.wiki.config.fileperms)
-        FileUtils.writeFileInPlace(dumpRunInfoFilename, text, self.wiki.config.fileperms)
+        FileUtils.writeFileInPlace(dump_runinfo_filename, text, self.wiki.config.fileperms)
 
     # format: name:%; updated:%; status:%
-    def _getStatusForJobFromRunInfoFileLine(self, line, jobName):
+    def _get_job_status_from_runinfo_line(self, line, job_name):
         # get rid of leading/trailing/embedded blanks
         line = line.replace(" ", "")
         line = line.replace("\n", "")
         fields = line.split(';', 2)
         for field in fields:
-            (fieldName, separator, fieldValue) = field.partition(':')
-            if fieldName == "name":
-                if not fieldValue == jobName:
+            (fieldname, separator, field_value) = field.partition(':')
+            if fieldname == "name":
+                if not field_value == job_name:
                     return None
-            elif fieldName == "status":
-                return fieldValue
+            elif fieldname == "status":
+                return field_value
 
-    def _getStatusForJobFromRunInfoFile(self, filename, jobName=""):
+    def _get_job_status_from_runinfo(self, filename, job_name=""):
         # read the dump run info file in, if there is one, and find out whether
         # a particular job (one step only, not a multiple piece job) has been
         # already run and whether it was successful (use to examine status
@@ -330,7 +330,7 @@
         try:
             infile = open(filename, "r")
             for line in infile:
-                result = self._getStatusForJobFromRunInfoFileLine(line, jobName)
+                result = self._get_job_status_from_runinfo_line(line, job_name)
                 if not result == None:
                     return result
             infile.close
@@ -342,7 +342,7 @@
             return None
 
     # find desc in there, look for "class='done'"
-    def _getStatusForJobFromIndexFileLine(self, line, desc):
+    def _get_job_status_from_html_line(self, line, desc):
         if not ">"+desc+"<" in line:
             return None
         if "<li class='done'>" in line:
@@ -350,7 +350,7 @@
         else:
             return "other"
 
-    def _getStatusForJobFromIndexFile(self, filename, desc):
+    def _get_job_status_from_html(self, filename, desc):
         # read the index file in, if there is one, and find out whether
         # a particular job (one step only, not a multiple piece job) has been
         # already run and whether it was successful (use to examine status
@@ -358,7 +358,7 @@
         try:
             infile = open(filename, "r")
             for line in infile:
-                result = self._getStatusForJobFromIndexFileLine(line, desc)
+                result = self._get_job_status_from_html_line(line, desc)
                 if not result == None:
                     return result
             infile.close
@@ -371,11 +371,11 @@
 
 
 class RunInfo(object):
-    def __init__(self, name="", status="", updated="", toBeRun=False):
+    def __init__(self, name="", status="", updated="", to_run=False):
         self._name = name
         self._status = status
         self._updated = updated
-        self._toBeRun = toBeRun
+        self._to_run = to_run
 
     def name(self):
         return self._name
@@ -386,20 +386,20 @@
     def updated(self):
         return self._updated
 
-    def toBeRun(self):
-        return self._toBeRun
+    def to_run(self):
+        return self._to_run
 
     def setName(self, name):
         self._name = name
 
-    def setStatus(self, status, setUpdated=True):
+    def setStatus(self, status, set_updated=True):
         self._status = status
 
     def setUpdated(self, updated):
         self._updated = updated
 
-    def setToBeRun(self, toBeRun):
-        self._toBeRun = toBeRun
+    def set_to_run(self, to_run):
+        self._to_run = to_run
 
 
 # so if the pages/revsPerChunkAbstract/History are just one number it means
@@ -407,59 +407,59 @@
 # otherwise we get passed alist that says "here's now many for each chunk and it's this many chunks.
 # extra pages/revs go in the last chunk, stuck on the end. too bad. :-P
 class Chunk(object,):
-    def __init__(self, wiki, dbName, errorCallback=None):
+    def __init__(self, wiki, db_name, error_callback=None):
 
-        self._dbName = dbName
+        self._dbName = db_name
         self.wiki = wiki
         self._chunks_enabled = self.wiki.config.chunksEnabled
         if self._chunks_enabled:
-            self.Stats = PageAndEditStats(self.wiki, dbName, errorCallback)
-            if not self.Stats.totalEdits or not self.Stats.totalPages:
+            self.Stats = PageAndEditStats(self.wiki, self._dbName, error_callback)
+            if not self.Stats.total_edits or not self.Stats.total_pages:
                 raise BackupError("Failed to get DB stats, exiting")
             if self.wiki.config.chunksForAbstract:
                 # we add 200 padding to cover new pages that may be added
-                pagesPerChunk = self.Stats.totalPages/int(self.wiki.config.chunksForAbstract) + 200
-                self._pagesPerChunkAbstract = [pagesPerChunk for i in range(0, int(self.wiki.config.chunksForAbstract))]
+                pagesPerChunk = self.Stats.total_pages/int(self.wiki.config.chunksForAbstract) + 200
+                self._pages_per_chunk_abstract = [pagesPerChunk for i in range(0, int(self.wiki.config.chunksForAbstract))]
             else:
-                self._pagesPerChunkAbstract = self.convertCommaSepLineToNumbers(self.wiki.config.pagesPerChunkAbstract)
+                self._pages_per_chunk_abstract = self.convert_comma_sep(self.wiki.config.pagesPerChunkAbstract)
 
-            self._pagesPerChunkHistory = self.convertCommaSepLineToNumbers(self.wiki.config.pagesPerChunkHistory)
-            self._revsPerChunkHistory = self.convertCommaSepLineToNumbers(self.wiki.config.revsPerChunkHistory)
-            self._recombineHistory = self.wiki.config.recombineHistory
+            self._pages_per_chunk_history = self.convert_comma_sep(self.wiki.config.pagesPerChunkHistory)
+            self._revs_per_chunk_history = self.convert_comma_sep(self.wiki.config.revsPerChunkHistory)
+            self._recombine_history = self.wiki.config.recombineHistory
         else:
-            self._pagesPerChunkHistory = False
-            self._revsPerChunkHistory = False
-            self._pagesPerChunkAbstract = False
-            self._recombineHistory = False
+            self._pages_per_chunk_history = False
+            self._revs_per_chunk_history = False
+            self._pages_per_chunk_abstract = False
+            self._recombine_history = False
         if self._chunks_enabled:
-            if self._revsPerChunkHistory:
-                if len(self._revsPerChunkHistory) == 1:
-                    self._numChunksHistory = self.getNumberOfChunksForXMLDumps(self.Stats.totalEdits, self._pagesPerChunkHistory[0])
-                    self._revsPerChunkHistory = [self._revsPerChunkHistory[0] for i in range(self._numChunksHistory)]
+            if self._revs_per_chunk_history:
+                if len(self._revs_per_chunk_history) == 1:
+                    self._num_chunks_history = self.getNumberOfChunksForXMLDumps(self.Stats.total_edits, self._pages_per_chunk_history[0])
+                    self._revs_per_chunk_history = [self._revs_per_chunk_history[0] for i in range(self._num_chunks_history)]
                 else:
-                    self._numChunksHistory = len(self._revsPerChunkHistory)
+                    self._num_chunks_history = len(self._revs_per_chunk_history)
                 # here we should generate the number of pages per chunk based on number of revs.
                 # ...next code update! FIXME
-                # self._pagesPerChunkHistory = ....
-            elif self._pagesPerChunkHistory:
-                if len(self._pagesPerChunkHistory) == 1:
-                    self._numChunksHistory = self.getNumberOfChunksForXMLDumps(self.Stats.totalPages, self._pagesPerChunkHistory[0])
-                    self._pagesPerChunkHistory = [self._pagesPerChunkHistory[0] for i in range(self._numChunksHistory)]
+                # self._pages_per_chunk_history = ....
+            elif self._pages_per_chunk_history:
+                if len(self._pages_per_chunk_history) == 1:
+                    self._num_chunks_history = self.getNumberOfChunksForXMLDumps(self.Stats.total_pages, self._pages_per_chunk_history[0])
+                    self._pages_per_chunk_history = [self._pages_per_chunk_history[0] for i in range(self._num_chunks_history)]
                 else:
-                    self._numChunksHistory = len(self._pagesPerChunkHistory)
+                    self._num_chunks_history = len(self._pages_per_chunk_history)
             else:
-                self._numChunksHistory = 0
+                self._num_chunks_history = 0
 
-            if self._pagesPerChunkAbstract:
-                if len(self._pagesPerChunkAbstract) == 1:
-                    self._numChunksAbstract = self.getNumberOfChunksForXMLDumps(self.Stats.totalPages, self._pagesPerChunkAbstract[0])
-                    self._pagesPerChunkAbstract = [self._pagesPerChunkAbstract[0] for i in range(self._numChunksAbstract)]
+            if self._pages_per_chunk_abstract:
+                if len(self._pages_per_chunk_abstract) == 1:
+                    self._num_chunks_abstract = self.getNumberOfChunksForXMLDumps(self.Stats.total_pages, self._pages_per_chunk_abstract[0])
+                    self._pages_per_chunk_abstract = [self._pages_per_chunk_abstract[0] for i in range(self._num_chunks_abstract)]
                 else:
-                    self._numChunksAbstract = len(self._pagesPerChunkAbstract)
+                    self._num_chunks_abstract = len(self._pages_per_chunk_abstract)
             else:
-                self._numChunksAbstract = 0
+                self._num_chunks_abstract = 0
 
-    def convertCommaSepLineToNumbers(self, line):
+    def convert_comma_sep(self, line):
         if line == "":
             return False
         result = line.split(',')
@@ -469,33 +469,33 @@
             numbers.append(int(field))
         return numbers
 
-    def getPagesPerChunkAbstract(self):
-        return self._pagesPerChunkAbstract
+    def get_pages_per_chunk_abstract(self):
+        return self._pages_per_chunk_abstract
 
-    def getNumChunksAbstract(self):
-        return self._numChunksAbstract
+    def get_num_chunks_abstract(self):
+        return self._num_chunks_abstract
 
     def getPagesPerChunkHistory(self):
-        return self._pagesPerChunkHistory
+        return self._pages_per_chunk_history
 
-    def getNumChunksHistory(self):
-        return self._numChunksHistory
+    def get_num_chunks_history(self):
+        return self._num_chunks_history
 
     def chunksEnabled(self):
         return self._chunks_enabled
 
     def recombineHistory(self):
-        return self._recombineHistory
+        return self._recombine_history
 
     # args: total (pages or revs), and the number of (pages or revs) per chunk.
-    def getNumberOfChunksForXMLDumps(self, total, perChunk):
+    def getNumberOfChunksForXMLDumps(self, total, per_chunk):
         if not total:
             # default: no chunking.
             return 0
         else:
-            chunks = int(total/perChunk)
+            chunks = int(total/per_chunk)
             # more smaller chunks are better, we want speed
-            if (total - (chunks * perChunk)) > 0:
+            if (total - (chunks * per_chunk)) > 0:
                 chunks = chunks + 1
             if chunks == 1:
                 return 0
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index de1f5ef..6a52fb6 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -97,41 +97,41 @@
             raise BackupError("You cannot specify a checkpoint file with the job %s, exiting.\n" % self._single_job)
 
         self.dumpItems = [PrivateTable("user", "usertable", "User account data."),
-                           PrivateTable("watchlist", "watchlisttable", "Users' watchlist settings."),
-                           PrivateTable("ipblocks", "ipblockstable", "Data for blocks of IP addresses, ranges, and users."),
-                           PrivateTable("archive", "archivetable", "Deleted page and revision data."),
-                           #PrivateTable("updates", "updatestable", "Update dataset for OAI updater system."),
-                           PrivateTable("logging", "loggingtable", "Data for various events (deletions, uploads, etc)."),
-                           PrivateTable("oldimage", "oldimagetable", "Metadata on prior versions of uploaded images."),
-                           #PrivateTable("filearchive", "filearchivetable", "Deleted image data"),
+                          PrivateTable("watchlist", "watchlisttable", "Users' watchlist settings."),
+                          PrivateTable("ipblocks", "ipblockstable", "Data for blocks of IP addresses, ranges, and users."),
+                          PrivateTable("archive", "archivetable", "Deleted page and revision data."),
+                          #PrivateTable("updates", "updatestable", "Update dataset for OAI updater system."),
+                          PrivateTable("logging", "loggingtable", "Data for various events (deletions, uploads, etc)."),
+                          PrivateTable("oldimage", "oldimagetable", "Metadata on prior versions of uploaded images."),
+                          #PrivateTable("filearchive", "filearchivetable", "Deleted image data"),
 
-                           PublicTable("site_stats", "sitestatstable", "A few statistics such as the page count."),
-                           PublicTable("image", "imagetable", "Metadata on current versions of uploaded media/files."),
-                           #PublicTable("oldimage", "oldimagetable", "Metadata on prior versions of uploaded media/files."),
-                           PublicTable("pagelinks", "pagelinkstable", "Wiki page-to-page link records."),
-                           PublicTable("categorylinks", "categorylinkstable", "Wiki category membership link records."),
-                           PublicTable("imagelinks", "imagelinkstable", "Wiki media/files usage records."),
-                           PublicTable("templatelinks", "templatelinkstable", "Wiki template inclusion link records."),
-                           PublicTable("externallinks", "externallinkstable", "Wiki external URL link records."),
-                           PublicTable("langlinks", "langlinkstable", "Wiki interlanguage link records."),
-                           #PublicTable("interwiki", "interwikitable", "Set of defined interwiki prefixes and links for this wiki."),
-                           PublicTable("user_groups", "usergroupstable", "User group assignments."),
-                           PublicTable("category", "categorytable", "Category information."),
+                          PublicTable("site_stats", "sitestatstable", "A few statistics such as the page count."),
+                          PublicTable("image", "imagetable", "Metadata on current versions of uploaded media/files."),
+                          #PublicTable("oldimage", "oldimagetable", "Metadata on prior versions of uploaded media/files."),
+                          PublicTable("pagelinks", "pagelinkstable", "Wiki page-to-page link records."),
+                          PublicTable("categorylinks", "categorylinkstable", "Wiki category membership link records."),
+                          PublicTable("imagelinks", "imagelinkstable", "Wiki media/files usage records."),
+                          PublicTable("templatelinks", "templatelinkstable", "Wiki template inclusion link records."),
+                          PublicTable("externallinks", "externallinkstable", "Wiki external URL link records."),
+                          PublicTable("langlinks", "langlinkstable", "Wiki interlanguage link records."),
+                          #PublicTable("interwiki", "interwikitable", "Set of defined interwiki prefixes and links for this wiki."),
+                          PublicTable("user_groups", "usergroupstable", "User group assignments."),
+                          PublicTable("category", "categorytable", "Category information."),
 
-                           PublicTable("page", "pagetable", "Base per-page data (id, title, old restrictions, etc)."),
-                           PublicTable("page_restrictions", "pagerestrictionstable", "Newer per-page restrictions table."),
-                           PublicTable("page_props", "pagepropstable", "Name/value pairs for pages."),
-                           PublicTable("protected_titles", "protectedtitlestable", "Nonexistent pages that have been protected."),
-                           #PublicTable("revision", #revisiontable", "Base per-revision data (does not include text)."), // safe?
-                           #PrivateTable("text", "texttable", "Text blob storage. May be compressed, etc."), // ?
-                           PublicTable("redirect", "redirecttable", "Redirect list"),
-                           PublicTable("iwlinks", "iwlinkstable", "Interwiki link tracking records"),
-                           PublicTable("geo_tags", "geotagstable", "List of pages' geographical coordinates"),
+                          PublicTable("page", "pagetable", "Base per-page data (id, title, old restrictions, etc)."),
+                          PublicTable("page_restrictions", "pagerestrictionstable", "Newer per-page restrictions table."),
+                          PublicTable("page_props", "pagepropstable", "Name/value pairs for pages."),
+                          PublicTable("protected_titles", "protectedtitlestable", "Nonexistent pages that have been protected."),
+                          #PublicTable("revision", #revisiontable", "Base per-revision data (does not include text)."), // safe?
+                          #PrivateTable("text", "texttable", "Text blob storage. May be compressed, etc."), // ?
+                          PublicTable("redirect", "redirecttable", "Redirect list"),
+                          PublicTable("iwlinks", "iwlinkstable", "Interwiki link tracking records"),
+                          PublicTable("geo_tags", "geotagstable", "List of pages' geographical coordinates"),
 
-                           TitleDump("pagetitlesdump", "List of page titles in main namespace"),
-                           AllTitleDump("allpagetitlesdump", "List of all page titles"),
+                          TitleDump("pagetitlesdump", "List of page titles in main namespace"),
+                          AllTitleDump("allpagetitlesdump", "List of all page titles"),
 
-                           AbstractDump("abstractsdump", "Extracted page abstracts for Yahoo", self._getChunkToDo("abstractsdump"), self.wiki.dbName, self.chunkInfo.getPagesPerChunkAbstract())]
+                          AbstractDump("abstractsdump", "Extracted page abstracts for Yahoo", self._get_chunk_to_do("abstractsdump"), self.wiki.dbName, self.chunkInfo.get_pages_per_chunk_abstract())]
 
         if self.chunkInfo.chunksEnabled():
             self.dumpItems.append(RecombineAbstractDump("abstractsdumprecombine", "Recombine extracted page abstracts for Yahoo", self.find_item_by_name('abstractsdump')))
@@ -223,7 +223,7 @@
                 "index of page titles/ids and offsets into the file. Useful for offline readers, or for parallel processing of pages.",
                 self.find_item_by_name(input_for_multistream), self.wiki, None))
 
-        results = self._runinfo_file.getOldRunInfoFromFile()
+        results = self._runinfo_file.get_old_runinfo_from_file()
         if results:
             for runinfo_obj in results:
                 self._set_dump_item_runinfo(runinfo_obj)
@@ -261,7 +261,7 @@
                 if item.name in self.skip_jobs:
                     item.setSkipped()
                 elif not skipgood or item.status() != "done":
-                    item.setToBeRun(True)
+                    item.set_to_run(True)
             return True
         else:
             for item in self.dumpItems:
@@ -269,7 +269,7 @@
                 if item.name in self.skip_jobs:
                     item.setSkipped()
                 elif not skipgood or item.status() != "done":
-                    item.setToBeRun(True)
+                    item.set_to_run(True)
                 return True
         if job == "noop" or job == "latestlinks" or job == "createdirs":
             return True
@@ -287,12 +287,12 @@
         i = 0;
         for item in self.dumpItems:
             i = i + 1;
-            if item.toBeRun():
+            if item.to_run():
                 for j in range(i, len(self.dumpItems)):
                     if item.name in self.skip_jobs:
                         item.setSkipped()
                     elif not skipgood or item.status() != "done":
-                        self.dumpItems[j].setToBeRun(True)
+                        self.dumpItems[j].set_to_run(True)
                 break
 
     def mark_all_jobs_to_run(self, skipgood=False):
@@ -301,7 +301,7 @@
             if item.name() in self.skip_jobs:
                 item.setSkipped()
             elif not skipgood or item.status() != "done":
-                item.setToBeRun(True)
+                item.set_to_run(True)
 
     def find_item_by_name(self, name):
         for item in self.dumpItems:
@@ -323,7 +323,7 @@
             if item.name() == runInfo.name():
                 item.setStatus(runInfo.status(), False)
                 item.setUpdated(runInfo.updated())
-                item.setToBeRun(runInfo.toBeRun())
+                item.set_to_run(runInfo.to_run())
                 return True
         return False
 
@@ -596,10 +596,10 @@
         for item in self.dumpItemList.dumpItems:
             Maintenance.exit_if_in_maintenance_mode("In maintenance mode, exiting dump of %s at step %s" % (self.dbName, item.name()))
-            if item.toBeRun():
+            if item.to_run():
                 item.start(self)
                 self.status.update_status_files()
-                self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.report_dump_runinfo())
+                self.runInfoFile.save_dump_runinfo_file(self.dumpItemList.report_dump_runinfo())
                 try:
                     item.dump(self)
                 except Exception, ex:
@@ -643,7 +643,7 @@
         # previously in "waiting" are still in status "waiting"
         self.status.update_status_files("partialdone")
 
-        self.runInfoFile.saveDumpRunInfoFile(self.dumpItemList.report_dump_runinfo())
+        self.runInfoFile.save_dump_runinfo_file(self.dumpItemList.report_dump_runinfo())
 
         # if any job succeeds we might as well make the sym link
         if self.status.fail_count < 1:
@@ -718,7 +718,7 @@
         self.sym_links.cleanup_symlinks()
 
         for item in self.dumpItemList.dumpItems:
-            if item.toBeRun():
+            if item.to_run():
                 dump_names = item.listDumpNames()
                 if type(dump_names).__name__!='list':
                     dump_names = [dump_names]
@@ -789,7 +789,7 @@
             # failed to get the run's info so let's call it 'didn't run'
             return False
 
-        results = dumpItemList._runinfo_file.getOldRunInfoFromFile()
+        results = dumpItemList._runinfo_file.get_old_runinfo_from_file()
         if results:
             for runinfo_obj in results:
                 dumpItemList._set_dump_item_runinfo(runinfo_obj)
@@ -806,7 +806,7 @@
         # see if there are any to run. no? then return True (all job(s) done)
         # otherwise return False (still some to do)
         for item in dumpItemList.dumpItems:
-            if item.toBeRun():
+            if item.to_run():
                 return False
         return True
     else:
diff --git a/xmldumps-backup/xmlstreams.py b/xmldumps-backup/xmlstreams.py
index 72adbbd..54b1caf 100644
--- a/xmldumps-backup/xmlstreams.py
+++ b/xmldumps-backup/xmlstreams.py
@@ -150,12 +150,12 @@
     wiki = WikiDump.Wiki(wikiconf, wikidb)
 
     db_info = worker.DbServerInfo(wiki, wikidb)
-    query = "select MAX(%s) from %s%s;" % (id_field, db_info.dBTablePrefix, table)
+    query = "select MAX(%s) from %s%s;" % (id_field, db_info.db_table_prefix, table)
    results = None
    retries = 0
    maxretries = 5
    end = 0
-    results = db_info.runSqlAndGetOutput(query)
+    results = db_info.run_sql_and_get_output(query)
    if results:
        lines = results.splitlines()
        if lines and lines[1]:
@@ -167,7 +167,7 @@
    while results is None and retries < maxretries:
        retries = retries + 1
        time.sleep(5)
-        results = db_info.runSqlAndGetOutput(query)
+        results = db_info.run_sql_and_get_output(query)
        if not results:
            continue
        lines = results.splitlines()

-- 
To view, visit https://gerrit.wikimedia.org/r/242464
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9c5104fa802d4bf1b85c4c61bd6f6419fa7e04f1
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>