http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95802

Revision: 95802
Author:   swalker
Date:     2011-08-30 18:36:27 +0000 (Tue, 30 Aug 2011)
Log Message:
-----------
Initial commit - script to link WikiProjects to categories

Added Paths:
-----------
    trunk/tools/wsor/wikiprojects/categorylinks_wikiprojects.py

Added: trunk/tools/wsor/wikiprojects/categorylinks_wikiprojects.py
===================================================================
--- trunk/tools/wsor/wikiprojects/categorylinks_wikiprojects.py	(rev 0)
+++ trunk/tools/wsor/wikiprojects/categorylinks_wikiprojects.py	2011-08-30 18:36:27 UTC (rev 95802)
@@ -0,0 +1,105 @@
+
+import os, sys, MySQLdb, re
+
+# Settings shared by both handles; remaining client options are read from
+# the user's ~/.my.cnf.
+_connect_args = {
+       'host': 'db42',
+       'db': 'shawn',
+       'read_default_file': os.path.expanduser("~/.my.cnf"),
+}
+
+# conn feeds the streaming category cursor; conn2 serves the per-category
+# lookups and inserts made while that stream is still being read.
+conn = MySQLdb.connect(**_connect_args)
+conn2 = MySQLdb.connect(**_connect_args)
+
+
+def getWikiProjectPage(conn, page_title):
+       """Look up the page id of a namespace-4 page by exact title.
+
+       conn       -- open MySQLdb connection used to run the query
+       page_title -- title compared against enwiki.page.page_title
+
+       Returns the page_id of the first matching row, or None when no row
+       matches.
+       """
+       cursor = conn.cursor(MySQLdb.cursors.SSCursor)
+       try:
+               cursor.execute("""SELECT page_id FROM enwiki.page WHERE page_title = %(page_title)s AND page_namespace = 4
+               """,
+                       {
+                       'page_title': page_title
+                       }
+               )
+               row = cursor.fetchone()
+       finally:
+               # Always release the server-side cursor so the connection is
+               # free for the next query, even if execute() raised.
+               cursor.close()
+
+       if row:
+               return row[0]
+       return None
+
+def parseWikiProject(conn, page_title):
+       """Given a categorylink, return the WikiProject name.
+
+       Everything from the first ``_article``, ``_page``, ``_member`` or
+       ``_participant`` marker to the end of ``page_title`` is removed.
+       The markers are tried in that order and only the first one that
+       occurs is applied.
+
+       conn       -- accepted for call compatibility; not used here
+       page_title -- category name to shorten
+
+       Returns the shortened name, or None when no marker occurs.
+       """
+       for marker in ('_article', '_page', '_member', '_participant'):
+               tail = marker + '.*$'
+               if re.search(tail, page_title):
+                       return re.sub(tail, '', page_title)
+
+       return None
+
+
+
+# Page id that the script treats as "no WikiProject found" once trimming
+# has removed every specific word from the name.
+# NOTE(review): presumably the generic "WikiProject" page - confirm.
+NO_MATCH_PAGE_ID = 33631
+
+# Stream every distinct category name.  A server-side cursor keeps conn busy
+# until the result set is exhausted, so every other query goes through conn2.
+ucursor = conn.cursor(MySQLdb.cursors.SSCursor)
+ucursor.execute("""
+                SELECT DISTINCT
+                                       cl_to
+                               FROM categorylinks
+                               """
+)
+
+# One cursor reused for all inserts instead of a new one per row.
+insert_cursor = conn2.cursor()
+
+for cat_link in ucursor:
+       category = cat_link[0]
+
+       # Start from the parsed WikiProject name when the category carries a
+       # recognised suffix, otherwise from the raw category name.  (The
+       # original assigned the parsed name to a misspelled variable, so it
+       # was never used.)
+       wp_page_name = parseWikiProject(conn2, category) or category
+       wp_page_id = getWikiProjectPage(conn2, wp_page_name)
+
+       # No page with that title: drop the last "_word" and retry.  Stop once
+       # no underscore is left; otherwise rfind() returns -1, the name shrinks
+       # one character at a time to '' and the loop never terminates.
+       while wp_page_id is None and '_' in wp_page_name:
+               wp_page_name = wp_page_name[0:wp_page_name.rfind('_')]
+               wp_page_id = getWikiProjectPage(conn2, wp_page_name)
+
+       if wp_page_id is None or wp_page_id == NO_MATCH_PAGE_ID:
+               sys.stdout.write("-- NOT found after trimming: %s - %s - %s\n\n" % (wp_page_name, category, wp_page_id))
+               sys.stdout.write("-")
+               continue
+
+       sys.stdout.write("++ FOUND after trimming: %s - %s - %s\n\n" % (wp_page_name, category, wp_page_id))
+       sys.stdout.write("+")
+       insert_cursor.execute("""INSERT INTO categorylinks_wp values (%(wp_id)s, %(category)s)
+                                               """,
+               {
+               'wp_id': wp_page_id,
+               'category': category
+               }
+       )
+
+insert_cursor.close()
+ucursor.close()
+
+# MySQLdb connections do not autocommit; without an explicit commit the
+# inserted rows are discarded on transactional tables when conn2 closes.
+conn2.commit()
\ No newline at end of file


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to