Sharihareswara has uploaded a new change for review.
https://gerrit.wikimedia.org/r/99540
Change subject: performance improvements for dbaccess version
......................................................................
performance improvements for dbaccess version
* Check multiple names in a single db query
* better variable names
* fixed test
* Unicode problems fixed
* run a set operation to get the list of missing names, not a for loop
Thanks to pairing with Allison, Alan, and Tom.
Change-Id: Iafe9a58dbead12c1e489b8fae3fe13408d92b75a
---
M webapp/missing.py
M webapp/tests.py
2 files changed, 21 insertions(+), 19 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/missing-from-wikipedia refs/changes/40/99540/1
diff --git a/webapp/missing.py b/webapp/missing.py
index 697cb8f..93f6ceb 100755
--- a/webapp/missing.py
+++ b/webapp/missing.py
@@ -46,13 +46,14 @@
# special case with "lastname, firstname, suffix"
# fix to "firstname lastname, suffix"
if (len(parts) == 3):
- final = "%s %s, %s" % (parts[0], parts[1], parts[2])
+ inprogress = "%s %s, %s" % (parts[0], parts[1], parts[2])
else:
- final = " ".join(parts)
+ inprogress = " ".join(parts)
# replace hyphens
- final = final.replace("- ", "-")
- spaces_to_underscores = final.replace(" ", "_")
- return spaces_to_underscores
+ inprogress = inprogress.replace("- ", "-")
+ spaces_to_underscores = inprogress.replace(" ", "_")
+ final = spaces_to_underscores.encode('utf-8')
+ return final
return [process_name(name) for name in names]
@@ -62,6 +63,11 @@
yield names[:CHUNK_SIZE]
names = names[CHUNK_SIZE:]
+def getconnection(wikipedia_language):
+ db = MySQLdb.connect(read_default_file='~/replica.my.cnf',
+ host=wikipedia_language+"wiki.labsdb",
+ db=wikipedia_language+"wiki_p")
+ return db.cursor()
def leftout(massaged_names, wikipedia_language):
"""Return list of people who don't have pages on the wiki.
@@ -71,23 +77,19 @@
Uses a direct MySQL check on the replicated database.
"""
-
- db = MySQLdb.connect(read_default_file='~/replica.my.cnf',
- host=wikipedia_language+"wiki.labsdb",
- db=wikipedia_language+"wiki_p")
- cur = db.cursor()
- resultlist = []
- for name in massaged_names:
- cur.execute("SELECT exists (SELECT page_id FROM page WHERE page_title = %s AND page_namespace=0);" , (name.encode('utf-8'),))
- sqlresults = cur.fetchall()
- if sqlresults[0] == (0L,): # the page does not exist
- resultlist.append(name)
- return resultlist
+ cur = getconnection(wikipedia_language)
+ sql = "SELECT page_title FROM page WHERE page_title in (%s) AND page_namespace=0;"
+ format_strings = ','.join(['%s'] * len(massaged_names))
+ cur.execute(sql % format_strings , massaged_names)
+ sqlresults = cur.fetchall()
+ exists_set = set(map(lambda x: x[0], sqlresults))
+ resultset = set(massaged_names).difference(exists_set)
+ return list(resultset)
def outputfile(resultlist, filename):
with codecs.open(filename, encoding='utf-8', mode='a') as out_fd:
- [out_fd.write("%s\n" % pagename) for pagename in resultlist]
+ [out_fd.write("%s\n" % pagename.decode('utf-8')) for pagename in resultlist]
def nameoutputfile(name):
diff --git a/webapp/tests.py b/webapp/tests.py
index 9d49008..a06e583 100755
--- a/webapp/tests.py
+++ b/webapp/tests.py
@@ -23,7 +23,7 @@
def test_name_reversal_hyphenation(self):
# Test that names of 1 or 2 items reverse & remove hyphen spaces properly.
testnames = ["Mazari, Abu ʿAbd Allah Muhammad al-", "Mlapa III", "Andrade, Mário Pinto de", "Bayram al-Khaʾmis, Mohamed", "Be’alu Girma", "Bédié, Henri-Konan", "Okwei"]
- expectedresult = ["Abu ʿAbd Allah Muhammad al-Mazari", "Mlapa III", "Mário Pinto de Andrade", "Mohamed Bayram al-Khaʾmis", "Be’alu Girma", "Henri-Konan Bédié", "Okwei"]
+ expectedresult = ["Abu_ʿAbd_Allah_Muhammad_al-Mazari", "Mlapa_III", "Mário_Pinto_de_Andrade", "Mohamed_Bayram_al-Khaʾmis", "Be’alu_Girma", "Henri-Konan_Bédié", "Okwei"]
testresult = massagenames(testnames)
self.assertEqual(testresult, expectedresult)
--
To view, visit https://gerrit.wikimedia.org/r/99540
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iafe9a58dbead12c1e489b8fae3fe13408d92b75a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/missing-from-wikipedia
Gerrit-Branch: master
Gerrit-Owner: Sharihareswara <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits