imagecopy_enwp.py

multichill Sat, 07 Aug 2010 09:09:19 -0700

Revision: 8388
Author:   multichill
Date:     2010-08-07 16:08:55 +0000 (Sat, 07 Aug 2010)


Log Message:
-----------
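Split the bot into three separate threads, connected by queues:
1. The fetcher thread gathers all the information for each image.
2. The user thread handles the user interaction.
3. The uploader thread does the actual uploading.

This speeds up the bot a lot, because fetching, user interaction and uploading now overlap instead of running one after another.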

Modified Paths:
--------------
    trunk/pywikipedia/imagecopy_enwp.py

Modified: trunk/pywikipedia/imagecopy_enwp.py
===================================================================
--- trunk/pywikipedia/imagecopy_enwp.py 2010-08-07 13:37:23 UTC (rev 8387)
+++ trunk/pywikipedia/imagecopy_enwp.py 2010-08-07 16:08:55 UTC (rev 8388)
@@ -51,6 +51,7 @@
 import os, sys, re, codecs
 import urllib, httplib, urllib2
 import webbrowser
+from Queue import Queue
 import time, threading
 import wikipedia, config, socket
 import pagegenerators, add_text
@@ -267,255 +268,252 @@
         return (self.filename, self.description, self.date, self.source, 
self.author, self.licensetemplate, self.categories, self.skip)
 
 
-def doiskip(imagepage):
+class imageFetcher(threading.Thread):
     '''
-    Skip this image or not.
-    Returns True if the image is on the skip list, otherwise False
+    Tries to fetch information for all images in the generator
     '''
-    for template in imagepage.templates():
-        if template in skipTemplates:
-            wikipedia.output(u'Found ' + template + u' which is on the 
template skip list')
-            return True
-    return False
+    def __init__ ( self, pagegenerator, prefetchQueue):
+        self.pagegenerator = pagegenerator
+        self.prefetchQueue = prefetchQueue
+        imagerecat.initLists()
+        threading.Thread.__init__ ( self )
 
-def getNewFields(imagepage):
-    '''
-    Build a new description based on the imagepage
-    '''
-    if u'{{Information' in imagepage.get() or u'{{information' in 
imagepage.get():
-        (description, date, source, author) = 
getNewFieldsFromInformation(imagepage)       
-    else:
-        (description, date, source, author) = 
getNewFieldsFromFreetext(imagepage)
+    def run(self):
+        for page in self.pagegenerator:
+            self.processImage(page)
+        self.prefetchQueue.put(None)
+        wikipedia.output(u'Fetched all images.')
+        return True
 
-    licensetemplate = getNewLicensetemplate(imagepage)
-    categories = getNewCategories(imagepage)
-    return (description, date, source, author, licensetemplate, categories)
+    def processImage(self, page):
+        '''
+        Work on a single image
+        '''
+        if page.exists() and (page.namespace() == 6) and (not 
page.isRedirectPage()):
+            imagepage = wikipedia.ImagePage(page.site(), page.title())
 
-def getNewFieldsFromInformation(imagepage):
-    '''
-    '''
-    description = u''
-    date = u''
-    source = u''
-    author = u''
-    permission = u''
-    other_versions = u''
-    text = imagepage.get()
-    # Need to add the permission field
-    # Need to use pywikipedia template parser code
-    regexes 
=[u'\{\{Information[\s\r\n]*\|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)\|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)\|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)\|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)\|[\s\r\n]*permission.*=(?P<permission>[^\}]*)\|[\s\r\n]*other_versions.*=(?P<other_versions>[^\}]*)\}\}',
-              
u'\{\{Information[\s\r\n]*\|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)\|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)\|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)\|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)\|[\s\r\n]*other_versions.*=(?P<other_versions>[^\}]*)\}\}',
              
-              ]
+            #First do autoskip.
+            if self.doiskip(imagepage):
+                wikipedia.output(u'Skipping %s : Got a template on the skip 
list.' % page.title())
+                return False
             
+            text = imagepage.get()
+            foundMatch = False
+            for (regex, replacement) in licenseTemplates:
+                match = re.search(regex, text, re.IGNORECASE)
+                if match:
+                    foundMatch = True
+            if not foundMatch:
+                wikipedia.output(u'Skipping %s : No suitable license template 
was found.' % page.title())
+                return False
+            self.prefetchQueue.put(self.getNewFields(imagepage))
 
-    for regex in regexes:
-        match =re.search(regex, text, re.IGNORECASE|re.DOTALL)
-        if match:
-            description = convertLinks(match.group(u'description').strip(), 
imagepage.site())
-            
-            date = match.group(u'date').strip()
-            if date == u'':
-                date = getUploadDate(imagepage)
+    def doiskip(self, imagepage):
+        '''
+        Skip this image or not.
+        Returns True if the image is on the skip list, otherwise False
+        '''
+        for template in imagepage.templates():
+            if template in skipTemplates:
+                wikipedia.output(u'Found ' + template + u' which is on the 
template skip list')
+                return True
+        return False
 
-            source = getSource(imagepage, 
source=convertLinks(match.group(u'source').strip(), imagepage.site()))
+    def getNewFields(self, imagepage):
+        '''
+        Build a new description based on the imagepage
+        '''
+        if u'{{Information' in imagepage.get() or u'{{information' in 
imagepage.get():
+            (description, date, source, author) = 
self.getNewFieldsFromInformation(imagepage)       
+        else:
+            (description, date, source, author) = 
self.getNewFieldsFromFreetext(imagepage)
 
-            author = convertLinks(match.group(u'author').strip(), 
imagepage.site())
-            if author == u'':
-                author = getAuthorText(imagepage)
-            
-            if u'permission' in match.groupdict():
-                permission = convertLinks(match.group(u'permission').strip(), 
imagepage.site())
-            if  u'other_versions' in match.groupdict():
-                other_versions = 
convertLinks(match.group(u'other_versions').strip(), imagepage.site())
-            # Return the stuff we found
-            return (description, date, source, author)
-    
-    #We didn't find anything, return the empty strings
-    return (description, date, source, author)
+        licensetemplate = self.getNewLicensetemplate(imagepage)
+        categories = self.getNewCategories(imagepage)
+        return (imagepage, description, date, source, author, licensetemplate, 
categories)
 
-def getNewFieldsFromFreetext(imagepage):
-    '''
-    '''
-    text = imagepage.get()
-    #text = re.sub(u'== Summary ==', u'', text, re.IGNORECASE)
-    #text = re.sub(u'== Licensing ==', u'', text, re.IGNORECASE)
-    #text = re.sub(u'\{\{(self|self2)\|[^\}]+\}\}', u'', text, re.IGNORECASE)
+    def getNewFieldsFromInformation(self, imagepage):
+        '''
+        Try to extract fields from the current information template for the 
new information template.
+        '''
+        description = u''
+        date = u''
+        source = u''
+        author = u''
+        permission = u''
+        other_versions = u''
+        text = imagepage.get()
+        # Need to add the permission field
+        # Need to use pywikipedia template parser code
+        regexes 
=[u'\{\{Information[\s\r\n]*\|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)\|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)\|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)\|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)\|[\s\r\n]*permission.*=(?P<permission>[^\}]*)\|[\s\r\n]*other_versions.*=(?P<other_versions>[^\}]*)\}\}',
+                  
u'\{\{Information[\s\r\n]*\|[\s\r\n]*description[\s\r\n]*=(?P<description>.*)\|[\s\r\n]*source[\s\r\n]*=(?P<source>.*)\|[\s\r\n]*date[\s\r\n]*=(?P<date>.*)\|[\s\r\n]*author[\s\r\n]*=(?P<author>.*)\|[\s\r\n]*other_versions.*=(?P<other_versions>[^\}]*)\}\}',
              
+                  ]
+                
+        for regex in regexes:
+            match =re.search(regex, text, re.IGNORECASE|re.DOTALL)
+            if match:
+                description = 
self.convertLinks(match.group(u'description').strip(), imagepage.site())
+                
+                date = match.group(u'date').strip()
+                if date == u'':
+                    date = self.getUploadDate(imagepage)
 
-    for toRemove in sourceGarbage:
-        text = re.sub(toRemove, u'', text, re.IGNORECASE)
-    
-    for (regex, repl) in licenseTemplates:
-        text = re.sub(regex, u'', text, re.IGNORECASE)
+                source = self.getSource(imagepage, 
source=self.convertLinks(match.group(u'source').strip(), imagepage.site()))
 
-    text = wikipedia.removeCategoryLinks(text, imagepage.site()).strip()
+                author = self.convertLinks(match.group(u'author').strip(), 
imagepage.site())
+                if author == u'':
+                    author = self.getAuthorText(imagepage)
+                
+                if u'permission' in match.groupdict():
+                    permission = 
self.convertLinks(match.group(u'permission').strip(), imagepage.site())
+                if  u'other_versions' in match.groupdict():
+                    other_versions = 
self.convertLinks(match.group(u'other_versions').strip(), imagepage.site())
+                # Return the stuff we found
+                return (description, date, source, author)
         
-    description = convertLinks(text.strip(), imagepage.site())
-    date = getUploadDate(imagepage)
-    source = getSource(imagepage)
-    author = getAuthorText(imagepage)
-    return (description, date, source, author)
+        #We didn't find anything, return the empty strings
+        return (description, date, source, author)
 
-def getUploadDate(imagepage):
-    # Get the original upload date
-    uploadtime = imagepage.getFileVersionHistory()[-1][0]
-    uploadDatetime = datetime.strptime(uploadtime, u'%Y-%m-%dT%H:%M:%SZ')
-    return u'{{Date|' + str(uploadDatetime.year) + u'|' + 
str(uploadDatetime.month) + u'|' + str(uploadDatetime.day) + u'}} (original 
upload date)'
+    def getNewFieldsFromFreetext(self, imagepage):
+        '''
+        Try to extract fields from free text for the new information template.
+        '''
+        text = imagepage.get()
+        #text = re.sub(u'== Summary ==', u'', text, re.IGNORECASE)
+        #text = re.sub(u'== Licensing ==', u'', text, re.IGNORECASE)
+        #text = re.sub(u'\{\{(self|self2)\|[^\}]+\}\}', u'', text, 
re.IGNORECASE)
 
-def getSource(imagepage, source=u''):
-    site = imagepage.site()
-    lang = site.language()
-    family = site.family.name
-    if source==u'':
-        source=u'{{Own}}'
+        for toRemove in sourceGarbage:
+            text = re.sub(toRemove, u'', text, re.IGNORECASE)
         
-    return source.strip() + u'<BR />Transferred from 
[http://%(lang)s.%(family)s.org %(lang)s.%(family)s]' % {u'lang' : lang, 
u'family' : family}
+        for (regex, repl) in licenseTemplates:
+            text = re.sub(regex, u'', text, re.IGNORECASE)
 
-def getAuthorText(imagepage):
-    site = imagepage.site()
-    lang = site.language()
-    family = site.family.name
-    
-    firstuploader = getAuthor(imagepage)
-    #FIXME : Make other sites than Wikipedia work
-    return u'[[:%(lang)s:User:%(firstuploader)s|%(firstuploader)s]] at 
[http://%(lang)s.%(family)s.org %(lang)s.%(family)s]' % {u'lang' : lang, 
u'family' : family , u'firstuploader' : firstuploader}
+        text = wikipedia.removeCategoryLinks(text, imagepage.site()).strip()
+            
+        description = self.convertLinks(text.strip(), imagepage.site())
+        date = self.getUploadDate(imagepage)
+        source = self.getSource(imagepage)
+        author = self.getAuthorText(imagepage)
+        return (description, date, source, author)
 
-def getAuthor(imagepage):
-    return imagepage.getFileVersionHistory()[-1][1].strip()
+    def getUploadDate(self, imagepage):
+        '''
+        Get the original upload date to put in the date field of the new 
information template. If we really have nothing better.
+        '''
+        uploadtime = imagepage.getFileVersionHistory()[-1][0]
+        uploadDatetime = datetime.strptime(uploadtime, u'%Y-%m-%dT%H:%M:%SZ')
+        return u'{{Date|' + str(uploadDatetime.year) + u'|' + 
str(uploadDatetime.month) + u'|' + str(uploadDatetime.day) + u'}} (original 
upload date)'
 
-def convertLinks(text, sourceSite):
-    lang = sourceSite.language()
-    family = sourceSite.family.name
-    conversions =[(u'\[\[([^\[\]\|]+)\|([^\[\]\|]+)\]\]', 
u'[[:%(lang)s:\\1|\\2]]'),
-                  (u'\[\[([^\[\]\|]+)\]\]', u'[[:%(lang)s:\\1|\\1]]'),
-                  ]
-    
-    for (regex, replacement) in conversions:
-        text = re.sub(regex, replacement  % {u'lang' : lang, u'family' : 
family}, text)              
+    def getSource(self, imagepage, source=u''):
+        '''
+        Get the text to put in the source field of the new information 
template.
+        '''
+        site = imagepage.site()
+        lang = site.language()
+        family = site.family.name
+        if source==u'':
+            source=u'{{Own}}'
+            
+        return source.strip() + u'<BR />Transferred from 
[http://%(lang)s.%(family)s.org %(lang)s.%(family)s]' % {u'lang' : lang, 
u'family' : family}
 
-    return text
+    def getAuthorText(self, imagepage):
+        '''
+        Get the original uploader to put in the author field of the new 
information template.
+        '''
+        site = imagepage.site()
+        lang = site.language()
+        family = site.family.name
+        
+        firstuploader = self.getAuthor(imagepage)
+        return u'[[:%(lang)s:User:%(firstuploader)s|%(firstuploader)s]] at 
[http://%(lang)s.%(family)s.org %(lang)s.%(family)s]' % {u'lang' : lang, 
u'family' : family , u'firstuploader' : firstuploader}
 
-def getNewLicensetemplate(imagepage):
-    '''
-    '''
-    text = imagepage.get()
-    
-    site = imagepage.site()
-    lang = site.language()
-    family = site.family.name
+    def getAuthor(self, imagepage):
+        '''
+        Get the first uploader.
+        '''
+        return imagepage.getFileVersionHistory()[-1][1].strip()
 
-    result = u''   
-
-    for (regex, replacement) in licenseTemplates:
-        match = re.search(regex, text, re.IGNORECASE)
-        if match:
-            result = re.sub(regex, replacement, match.group(0), re.IGNORECASE)
-            return result % {u'author' : getAuthor(imagepage),
-                             u'lang' : lang,
-                             u'family' : family}
+    def convertLinks(self, text, sourceSite):
+        '''
+        Convert links from the current wiki to Commons.
+        '''
+        lang = sourceSite.language()
+        family = sourceSite.family.name
+        conversions =[(u'\[\[([^\[\]\|]+)\|([^\[\]\|]+)\]\]', 
u'[[:%(lang)s:\\1|\\2]]'),
+                      (u'\[\[([^\[\]\|]+)\]\]', u'[[:%(lang)s:\\1|\\1]]'),
+                      ]
         
-    return result
-    
-def getNewCategories(imagepage):
-    '''
-    Get a categories for the image
-    Dont forget to filter
-    '''
-    result = u''
-    (commonshelperCats, usage, galleries) = 
imagerecat.getCommonshelperCats(imagepage)
-    newcats = imagerecat.applyAllFilters(commonshelperCats)
-    for newcat in newcats:
-        result = result + u'[[Category:' + newcat + u']] '
-    return result
+        for (regex, replacement) in conversions:
+            text = re.sub(regex, replacement  % {u'lang' : lang, u'family' : 
family}, text)              
 
-def getOriginalUploadLog(imagepage):
-    filehistory = imagepage.getFileVersionHistory()
-    filehistory.reverse()
+        return text
 
-    site = imagepage.site()
-    lang = site.language()
-    family = site.family.name
-
-    sourceimage = 
imagepage.site().get_address(imagepage.title()).replace(u'&redirect=no&useskin=monobook',
 u'')
-    
-    result = u'== {{Original upload log}} ==\n'
-    result = result + u'The original description page is/was 
[http://%(lang)s.%(family)s.org%(sourceimage)s here]. All following user names 
refer to %(lang)s.%(family)s.\n' % {u'lang' : lang, u'family' : family , 
u'sourceimage' : sourceimage}
-    for (timestamp, username, resolution, size, comment) in filehistory:
-        date = datetime.strptime(timestamp, 
u'%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M')
-        result = result + u'* %(date)s 
[[:%(lang)s:user:%(username)s|%(username)s]] %(resolution)s (%(size)s bytes) 
\'\'<nowiki>%(comment)s</nowiki>\'\'\n' % {
-            u'lang' : lang,
-            u'family' : family ,
-            u'date' : date,
-            u'username' : username,
-            u'resolution': resolution,
-            u'size': size,
-            u'comment' : comment}       
+    def getNewLicensetemplate(self, imagepage):
+        '''
+        Get a license template to put on the image to be uploaded
+        '''
+        text = imagepage.get()
         
-    return result
+        site = imagepage.site()
+        lang = site.language()
+        family = site.family.name
 
-    
+        result = u''   
 
-def buildNewImageDescription(imagepage, description, date, source, author, 
licensetemplate, categories, checkTemplate):
-    '''
-    Build a new information template 
-    '''
-    
-    site = imagepage.site()
-    lang = site.language()
-    family = site.family.name
-    
-    cid = u''
-    if checkTemplate:
-        cid = cid + 
u'\n{{BotMoveToCommons|%(lang)s.%(family)s|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}}}\n'
 % {u'lang' : lang, u'family' : family}
-    cid = cid + u'== {{int:filedesc}} ==\n'
-    cid = cid + u'{{Information\n'
-    cid = cid + u'|description={{%(lang)s|1=' % {u'lang' : lang, u'family' : 
family}
-    cid = cid + description + u'}}\n' 
-    cid = cid + u'|date=' + date + u'\n'
-    cid = cid + u'|source=' + source + u'\n'
-    cid = cid + u'|author=' + author + u'\n'
-    cid = cid + u'|permission=\n'
-    cid = cid + u'|other_versions=\n'
-    cid = cid + u'}}\n'
-    cid = cid + u'== {{int:license}} ==\n'
-    cid = cid + licensetemplate + u'\n'
-    cid = cid + u'\n'
-    cid = cid + getOriginalUploadLog(imagepage)
-    cid = cid + u'__NOTOC__\n'
-    if categories.strip()==u'':
-        cid = cid + u'{{Subst:Unc}}'
-    else:
-        cid = cid + categories
-    return cid
-
-
-def processImage(page, checkTemplate):
-    skip = False
-    if page.exists() and (page.namespace() == 6) and (not 
page.isRedirectPage()):
-        imagepage = wikipedia.ImagePage(page.site(), page.title())
-
-        #First do autoskip.
-        if doiskip(imagepage):
-            wikipedia.output(u'Skipping %s : Got a template on the skip list.' 
% page.title())
-            return False
-        
-        text = imagepage.get()
-        foundMatch = False
         for (regex, replacement) in licenseTemplates:
             match = re.search(regex, text, re.IGNORECASE)
             if match:
-                foundMatch = True
-        if not foundMatch:
-            wikipedia.output(u'Skipping %s : No suitable license template was 
found.' % page.title())
-            return False
+                result = re.sub(regex, replacement, match.group(0), 
re.IGNORECASE)
+                return result % {u'author' : self.getAuthor(imagepage),
+                                 u'lang' : lang,
+                                 u'family' : family}
+            
+        return result
         
-        (description, date, source, author, licensetemplate, categories) = 
getNewFields(imagepage)
+    def getNewCategories(self, imagepage):
+        '''
+        Get a categories for the image
+        Dont forget to filter
+        '''
+        result = u''
+        (commonshelperCats, usage, galleries) = 
imagerecat.getCommonshelperCats(imagepage)
+        newcats = imagerecat.applyAllFilters(commonshelperCats)
+        for newcat in newcats:
+            result = result + u'[[Category:' + newcat + u']] '
+        return result
 
+class userInteraction(threading.Thread):
+    '''
+    Prompt all images to the user.
+    '''
+    def __init__ ( self, prefetchQueue, uploadQueue):
+        self.prefetchQueue = prefetchQueue
+        self.uploadQueue = uploadQueue
+        threading.Thread.__init__ ( self )
+
+    def run(self):
         while True:
+            fields = self.prefetchQueue.get()
+            if fields:
+                self.processImage(fields)
+            else:
+                break
+        self.uploadQueue.put(None)
+        wikipedia.output(u'User worked on all images.')
+        return True
+            
+    def processImage(self, fields):
+        '''
+        Work on a single image
+        '''
+        (imagepage, description, date, source, author, licensetemplate, 
categories) = fields
+        while True:
             # Do the Tkdialog to accept/reject and change te name
             (filename, description, date, source, author, licensetemplate, 
categories, skip)=Tkdialog(imagepage, description, date, source, author, 
licensetemplate, categories).getnewmetadata()
 
             if skip:
-                wikipedia.output(u'Skipping %s : User pressed skip.' % 
page.title())
+                wikipedia.output(u'Skipping %s : User pressed skip.' % 
imagepage.title())
                 return False
                    
             # Check if the image already exists
@@ -524,13 +522,115 @@
                 break
             else:
                 wikipedia.output('Image already exists, pick another name or 
skip this image')
-                # We dont overwrite images, pick another name, go to the start 
of the loop   
-        
-        cid = buildNewImageDescription(imagepage, description, date, source, 
author, licensetemplate, categories, checkTemplate)
+                # We dont overwrite images, pick another name, go to the start 
of the loop
+
+        self.uploadQueue.put((imagepage, filename, description, date, source, 
author, licensetemplate, categories))
+
+
+class uploader(threading.Thread):
+    '''
+    Upload all images
+    '''
+    def __init__ ( self, uploadQueue):
+        self.uploadQueue = uploadQueue
+        self.checktemplate = True
+        threading.Thread.__init__ ( self )
+
+    def run(self):
+        while True: #Change later
+            fields = self.uploadQueue.get()
+            if fields:
+                self.processImage(fields)
+            else:
+                break
+        return True
+
+    def nochecktemplate(self):
+        '''
+        Don't want to add {{BotMoveToCommons}}
+        '''
+        self.checktemplate = False
+        return
+    
+    def processImage(self, fields):
+        '''
+        Work on a single image
+        '''
+        (imagepage, filename, description, date, source, author, 
licensetemplate, categories) = fields
+        cid = self.buildNewImageDescription(imagepage, description, date, 
source, author, licensetemplate, categories)
         wikipedia.output(cid)
         bot = UploadRobot(url=imagepage.fileUrl(), description=cid, 
useFilename=filename, keepFilename=True, verifyDescription=False, ignoreWarning 
= True, targetSite = wikipedia.getSite('commons', 'commons'))
         bot.run()
         
+        self.tagNowcommons(imagepage, filename)
+        self.replaceUsage(imagepage, filename)
+       
+
+    def buildNewImageDescription(self, imagepage, description, date, source, 
author, licensetemplate, categories):
+        '''
+        Build a new information template 
+        '''
+        
+        site = imagepage.site()
+        lang = site.language()
+        family = site.family.name
+        
+        cid = u''
+        if self.checktemplate:
+            cid = cid + 
u'\n{{BotMoveToCommons|%(lang)s.%(family)s|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}}}\n'
 % {u'lang' : lang, u'family' : family}
+        cid = cid + u'== {{int:filedesc}} ==\n'
+        cid = cid + u'{{Information\n'
+        cid = cid + u'|description={{%(lang)s|1=' % {u'lang' : lang, u'family' 
: family}
+        cid = cid + description + u'}}\n' 
+        cid = cid + u'|date=' + date + u'\n'
+        cid = cid + u'|source=' + source + u'\n'
+        cid = cid + u'|author=' + author + u'\n'
+        cid = cid + u'|permission=\n'
+        cid = cid + u'|other_versions=\n'
+        cid = cid + u'}}\n'
+        cid = cid + u'== {{int:license}} ==\n'
+        cid = cid + licensetemplate + u'\n'
+        cid = cid + u'\n'
+        cid = cid + self.getOriginalUploadLog(imagepage)
+        cid = cid + u'__NOTOC__\n'
+        if categories.strip()==u'':
+            cid = cid + u'{{Subst:Unc}}'
+        else:
+            cid = cid + categories
+        return cid
+
+    def getOriginalUploadLog(self, imagepage):
+        '''
+        Get the original upload log to put at the bottom of the image 
description page at Commons.
+        '''
+        filehistory = imagepage.getFileVersionHistory()
+        filehistory.reverse()
+
+        site = imagepage.site()
+        lang = site.language()
+        family = site.family.name
+
+        sourceimage = 
imagepage.site().get_address(imagepage.title()).replace(u'&redirect=no&useskin=monobook',
 u'')
+        
+        result = u'== {{Original upload log}} ==\n'
+        result = result + u'The original description page is/was 
[http://%(lang)s.%(family)s.org%(sourceimage)s here]. All following user names 
refer to %(lang)s.%(family)s.\n' % {u'lang' : lang, u'family' : family , 
u'sourceimage' : sourceimage}
+        for (timestamp, username, resolution, size, comment) in filehistory:
+            date = datetime.strptime(timestamp, 
u'%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M')
+            result = result + u'* %(date)s 
[[:%(lang)s:user:%(username)s|%(username)s]] %(resolution)s (%(size)s bytes) 
\'\'<nowiki>%(comment)s</nowiki>\'\'\n' % {
+                u'lang' : lang,
+                u'family' : family ,
+                u'date' : date,
+                u'username' : username,
+                u'resolution': resolution,
+                u'size': size,
+                u'comment' : comment}       
+            
+        return result
+
+    def tagNowcommons(self, imagepage, filename):
+        '''
+        Tagged the imag which has been moved to Commons for deletion.
+        '''
         if wikipedia.Page(wikipedia.getSite('commons', 'commons'), u'File:' + 
filename).exists():
             #Get a fresh copy, force to get the page so we dont run into edit 
conflicts
             imtxt=imagepage.get(force=True)
@@ -554,27 +654,32 @@
             wikipedia.showDiff(imagepage.get(), imtxt + addTemplate)
             imagepage.put(imtxt + addTemplate, comment = commentText)
 
+    def replaceUsage(self, imagepage, filename):
+        '''
+        If the image is uploaded under a different name, replace all usage.
+        '''
+        if imagepage.titleWithoutNamespace() != filename:
             gen = pagegenerators.FileLinksGenerator(imagepage)
             preloadingGen = pagegenerators.PreloadingGenerator(gen)
-
-            #If the image is uploaded under a different name, replace all 
instances
-            if imagepage.titleWithoutNamespace() != filename:
-                if imagepage.site().language() in imageMoveMessage:
-                    moveSummary = 
imageMoveMessage[imagepage.site().language()] % 
(imagepage.titleWithoutNamespace(), filename)
-                else:
-                    moveSummary = imageMoveMessage['_default'] % 
(imagepage.titleWithoutNamespace(), filename)
-                imagebot = ImageRobot(generator = preloadingGen, oldImage = 
imagepage.titleWithoutNamespace(), newImage = filename, summary = moveSummary, 
always = True, loose = True)
-                imagebot.run()             
+            
+            if imagepage.site().language() in imageMoveMessage:
+                moveSummary = imageMoveMessage[imagepage.site().language()] % 
(imagepage.titleWithoutNamespace(), filename)
+            else:
+                moveSummary = imageMoveMessage['_default'] % 
(imagepage.titleWithoutNamespace(), filename)
+            imagebot = ImageRobot(generator = preloadingGen, oldImage = 
imagepage.titleWithoutNamespace(), newImage = filename, summary = moveSummary, 
always = True, loose = True)
+            imagebot.run()   
     
 
+def main(args):
+    wikipedia.output(u'WARNING: This is an experimental bot')
+    wikipedia.output(u'WARNING: It will only work on self published work 
images')
+    wikipedia.output(u'WARNING: This bot is still full of bugs')
+    wikipedia.output(u'WARNING: Use at your own risk!')
 
-def main(args):
     generator = None;
-    #newname = "";
-    imagepage = None;
     always = False
     checkTemplate = True
-    imagerecat.initLists()
+    
     # Load a lot of default generators
     genFactory = pagegenerators.GeneratorFactory()
 
@@ -590,16 +695,25 @@
 
     pregenerator = pagegenerators.PreloadingGenerator(generator)
 
-    for page in pregenerator:
-        processImage(page, checkTemplate)
+    prefetchQueue = Queue(maxsize=50)
+    uploadQueue = Queue(maxsize=200)
 
+    imageFetcherThread = imageFetcher(pregenerator, prefetchQueue)
+    userInteractionThread = userInteraction(prefetchQueue, uploadQueue)
+    uploaderThread = uploader(uploadQueue)
 
-    wikipedia.output(u'Still ' + str(threading.activeCount()) + u' active 
threads, lets wait')
-    for openthread in threading.enumerate():
-        if openthread != threading.currentThread():
-            openthread.join()
-    wikipedia.output(u'All threads are done')
+    imageFetcherThread.daemon=False
+    userInteractionThread.daemon=False
+    uploaderThread.daemon=False
+    
+    if not checkTemplate:
+        uploaderThread.nochecktemplate()
+    
+    fetchDone = imageFetcherThread.start()
+    userDone = userInteractionThread.start()
+    uploadDone = uploaderThread.start()
 
+
 if __name__ == "__main__":
     try:
         main(sys.argv[1:])



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

[Pywikipedia-svn] SVN: [8388] trunk/pywikipedia/imagecopy_enwp.py

Reply via email to