Sharihareswara has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/99540


Change subject: performance improvements for dbaccess version
......................................................................

performance improvements for dbaccess version

* Check multiple names in a single DB query
* Use better variable names
* Fix the test
* Fix Unicode problems
* Use a set operation, not a for loop, to get the list of missing names

Thanks to pairing with Allison, Alan, and Tom.

Change-Id: Iafe9a58dbead12c1e489b8fae3fe13408d92b75a
---
M webapp/missing.py
M webapp/tests.py
2 files changed, 21 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/missing-from-wikipedia refs/changes/40/99540/1

diff --git a/webapp/missing.py b/webapp/missing.py
index 697cb8f..93f6ceb 100755
--- a/webapp/missing.py
+++ b/webapp/missing.py
@@ -46,13 +46,14 @@
         # special case with "lastname, firstname, suffix"
         # fix to "firstname lastname, suffix"
         if (len(parts) == 3):
-            final = "%s %s, %s" % (parts[0], parts[1], parts[2])
+            inprogress = "%s %s, %s" % (parts[0], parts[1], parts[2])
         else:
-            final = " ".join(parts)
+            inprogress = " ".join(parts)
         # replace hyphens
-        final = final.replace("- ", "-")
-        spaces_to_underscores = final.replace(" ", "_")
-        return spaces_to_underscores
+        inprogress = inprogress.replace("- ", "-")
+        spaces_to_underscores = inprogress.replace(" ", "_")
+        final = spaces_to_underscores.encode('utf-8')
+        return final
     return [process_name(name) for name in names]
 
 
@@ -62,6 +63,11 @@
         yield names[:CHUNK_SIZE]
         names = names[CHUNK_SIZE:]
 
+def getconnection(wikipedia_language):
+    db = MySQLdb.connect(read_default_file='~/replica.my.cnf',
+                         host=wikipedia_language+"wiki.labsdb",
+                         db=wikipedia_language+"wiki_p")
+    return db.cursor()
 
 def leftout(massaged_names, wikipedia_language):
     """Return list of people who don't have pages on the wiki.
@@ -71,23 +77,19 @@
     Uses a direct MySQL check on the replicated database.
 """
 
-
-    db = MySQLdb.connect(read_default_file='~/replica.my.cnf',
-                         host=wikipedia_language+"wiki.labsdb",
-                         db=wikipedia_language+"wiki_p")
-    cur = db.cursor()
-    resultlist = []
-    for name in massaged_names:
-        cur.execute("SELECT exists (SELECT page_id FROM page WHERE page_title 
= %s AND page_namespace=0);" , (name.encode('utf-8'),))
-        sqlresults = cur.fetchall()
-        if sqlresults[0] == (0L,):  # the page does not exist
-            resultlist.append(name)
-    return resultlist
+    cur = getconnection(wikipedia_language)
+    sql = "SELECT page_title FROM page WHERE page_title in (%s) AND 
page_namespace=0;"
+    format_strings = ','.join(['%s'] * len(massaged_names))
+    cur.execute(sql % format_strings , massaged_names)
+    sqlresults = cur.fetchall()
+    exists_set = set(map(lambda x: x[0], sqlresults))
+    resultset = set(massaged_names).difference(exists_set)
+    return list(resultset)
 
 
 def outputfile(resultlist, filename):
     with codecs.open(filename, encoding='utf-8', mode='a') as out_fd:
-        [out_fd.write("%s\n" % pagename) for pagename in resultlist]
+        [out_fd.write("%s\n" % pagename.decode('utf-8')) for pagename in 
resultlist]
 
 
 def nameoutputfile(name):
diff --git a/webapp/tests.py b/webapp/tests.py
index 9d49008..a06e583 100755
--- a/webapp/tests.py
+++ b/webapp/tests.py
@@ -23,7 +23,7 @@
     def test_name_reversal_hyphenation(self):
     # Test that names of 1 or 2 items reverse & remove hyphen spaces properly.
         testnames = ["Mazari, Abu ʿAbd Allah Muhammad al-", "Mlapa III", 
"Andrade, Mário Pinto de", "Bayram al-Khaʾmis, Mohamed", "Be’alu Girma", 
"Bédié, Henri-Konan", "Okwei"]
-        expectedresult = ["Abu ʿAbd Allah Muhammad al-Mazari", "Mlapa III", 
"Mário Pinto de Andrade", "Mohamed Bayram al-Khaʾmis", "Be’alu Girma", 
"Henri-Konan Bédié", "Okwei"]
+        expectedresult = ["Abu_ʿAbd_Allah_Muhammad_al-Mazari", "Mlapa_III", 
"Mário_Pinto_de_Andrade", "Mohamed_Bayram_al-Khaʾmis", "Be’alu_Girma", 
"Henri-Konan_Bédié", "Okwei"]
         testresult = massagenames(testnames)
         self.assertEqual(testresult, expectedresult)
 

-- 
To view, visit https://gerrit.wikimedia.org/r/99540
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iafe9a58dbead12c1e489b8fae3fe13408d92b75a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/missing-from-wikipedia
Gerrit-Branch: master
Gerrit-Owner: Sharihareswara <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to