Prianka has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/194824

Change subject: Port and re-package copyright*.py
......................................................................

Port and re-package copyright*.py

Bug: T66848
Change-Id: Ia0c3a9fe6a2c3be3cdbad517ac9dbf3249c197ab
---
A copyright/exclusion_list.txt
A copyright/site_protected_list.txt
M pywikibot/config2.py
A scripts/copyright/__init__.py
A scripts/copyright/copyright.py
A scripts/copyright/copyright_clean.py
A scripts/copyright/copyright_put.py
7 files changed, 1,933 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/24/194824/1

diff --git a/copyright/exclusion_list.txt b/copyright/exclusion_list.txt
new file mode 100644
index 0000000..0f92ced
--- /dev/null
+++ b/copyright/exclusion_list.txt
@@ -0,0 +1,37 @@
+#
+#
+# You can list here partial URLs that should be rejected by the
+# exclusion system. Please note that there is an online and active
+# version of this file, which is automatically loaded by
+# copyright.py.
+#
+# You could also consider reporting spam to:
+#
+#    http://www.google.com/contact/spamreport.html
+#    http://add.yahoo.com/fast/help/us/ysearch/cgi_reportsearchspam
+#
+# Please report mirrors and forks of Wikipedia to:
+#
+#    http://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks
+#
+# or the equivalent page on other Wikipedias, according to the
+# language used by the clone.
+#
+
+#
+# This ignores URLs containing 'wikipedia', 'wikibooks',
+# 'wikiquote'...
+
+wikipedia
+wikibooks
+wikiquote
+wikisource
+wikimedia
+wikinews
+wiktionary
+wikiversity
+
+#
+# Uncomment the line below if you want to exclude URLs containing '.ebay.'
+#.ebay.
+                
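
Note: copyright.py (below) applies these entries with a plain substring
test against each search-result URL (see URLExclusion.check()). A minimal,
self-contained sketch of that matching, for illustration only and not part
of the patch:

    exclusion_entries = {'wikipedia', 'wikibooks', '.ebay.'}

    def is_excluded(url):
        """Return True if any exclusion entry occurs anywhere in the URL."""
        return any(entry in url for entry in exclusion_entries)

    print(is_excluded('http://en.wikipedia.org/wiki/Foo'))  # True
    print(is_excluded('http://example.com/mirror/Foo'))     # False
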
diff --git a/copyright/site_protected_list.txt b/copyright/site_protected_list.txt
new file mode 100644
index 0000000..2193037
--- /dev/null
+++ b/copyright/site_protected_list.txt
@@ -0,0 +1,7 @@
+britannica.com # On-line encyclopedia (publisher: Encyclopædia Britannica)
+encarta.msn.com # On-line encyclopedia (publisher: Microsoft)
+pbmstoria.it # On-line encyclopedia (publisher: Bruno Mondadori)
+sapere.it # On-line encyclopedia (publisher: De Agostini)
+treccani.it # On-line encyclopedia (publisher: Istituto della Enciclopedia italiana)
+
+cronologia.it # Amateur site (author: Franco Gonzato)
diff --git a/pywikibot/config2.py b/pywikibot/config2.py
index 0d9d670..fda1f68 100644
--- a/pywikibot/config2.py
+++ b/pywikibot/config2.py
@@ -569,6 +569,7 @@
 
 # ############# SEARCH ENGINE SETTINGS ##############
 
+google_key = ''
 # Some scripts allow using the Yahoo! Search Web Services. To use this feature,
 # you must install the pYsearch module from http://pysearch.sourceforge.net
 # and get a Yahoo AppID from https://developer.yahoo.com/
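
Note: these search-engine credentials (google_key added here, plus the
yahoo_appid and msn_appid values referenced by copyright.py below) are
normally overridden in the user's user-config.py rather than edited in
config2.py. A hedged example with placeholder values, not real keys:

    # user-config.py (excerpt) -- placeholder values only
    google_key = 'YOUR-GOOGLE-SOAP-API-KEY'
    yahoo_appid = 'YOUR-YAHOO-APPID'
    msn_appid = 'YOUR-LIVE-SEARCH-APPID'
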
diff --git a/scripts/copyright/__init__.py b/scripts/copyright/__init__.py
new file mode 100644
index 0000000..573b320
--- /dev/null
+++ b/scripts/copyright/__init__.py
@@ -0,0 +1,2 @@
+# THIS DIRECTORY IS TO HOLD BOT SCRIPTS FOR THE NEW FRAMEWORK
+"""Copyright tool scripts to handle API."""
diff --git a/scripts/copyright/copyright.py b/scripts/copyright/copyright.py
new file mode 100644
index 0000000..f105676
--- /dev/null
+++ b/scripts/copyright/copyright.py
@@ -0,0 +1,1271 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""
+This robot checks for copied text using Google, Yahoo! and Live Search.
+
+Google search requires installing the pyGoogle module from
+http://pygoogle.sf.net and getting a Google API license key from
+http://code.google.com/apis/soapsearch/ (but since December 2006 Google is
+no longer issuing new SOAP API keys).
+
+Yahoo! search requires the pYsearch module from http://pysearch.sourceforge.net
+and a Yahoo AppID from http://developer.yahoo.com.
+
+Windows Live Search requires an AppID from
+http://search.msn.com/developer
+and the SOAPpy module, which can be downloaded from http://pywebsvcs.sf.net or
+checked out via SVN with the following command:
+
+svn co http://pywebsvcs.svn.sourceforge.net/svnroot/pywebsvcs/trunk/SOAPpy/SOAPpy SOAPpy
+
+Unlike SOAPpy version 0.12, the current SVN version works with Python 2.5.
+
+
+You can run the bot with the following command-line parameters:
+
+-g           - Use Google search engine
+-ng          - Do not use Google
+-y           - Use Yahoo! search engine
+-ny          - Do not use Yahoo!
+-l           - Use Windows Live Search engine
+-nl          - Do not use Windows Live Search
+-maxquery    - Stop after a specified number of queries per page (default: 25)
+-skipquery   - Skip a specified number of queries
+-output      - Append results to a specified file (default:
+               'copyright/output.txt')
+
+-text:input_text - Work on a specified text
+
+-file        - Work on all pages given in a local text file.
+               Will read any [[wiki link]] and use these articles.
+               Argument can also be given as "-file:filename".
+-new         - Work on the 60 newest pages. If given as -new:x, will work
+               on the x newest pages.
+-cat         - Work on all pages which are in a specific category.
+               Argument can also be given as "-cat:categoryname".
+-subcat      - When the pages to work on have been chosen by -cat, pages in
+               subcategories of the selected category are also included.
+               When -cat has not been selected, this has no effect.
+-page        - Only check a specific page.
+               Argument can also be given as "-page:pagetitle". You can give
+               this parameter multiple times to check multiple pages.
+-ref         - Work on all pages that link to a certain page.
+               Argument can also be given as "-ref:referredpagetitle".
+-filelinks   - Work on all pages that link to a certain image.
+               Argument can also be given as "-filelinks:ImageName".
+-links       - Work on all pages that are linked to from a certain page.
+               Argument can also be given as "-links:linkingpagetitle".
+-start       - Work on all pages in the wiki, starting at a given page.
+-namespace:n - Number or name of namespace to process. The parameter can be used
+               multiple times.
+
+Examples:
+
+If you want to check the first 50 new articles, use this command:
+
+    python copyright.py -new:50
+
+If you want to check a category with no limit on the number of queries,
+use this:
+
+    python copyright.py -cat:"Wikipedia featured articles" -maxquery:0
+
+You can also include the text to examine directly on the command line:
+
+    python copyright.py -text:"
+    ...text...
+    "
+"""
+
+#
+# (c) Francesco Cosoleto, 2006
+# (c) Pywikibot team 2006-2015
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import re
+import codecs
+import os
+import time
+import urllib
+import urllib2
+
+import pywikibot
+from pywikibot import pagegenerators
+from pywikibot import config
+
+
+# Search keywords added to all the queries.
+no_result_with_those_words = '-Wikipedia'
+
+# Perform a search engine query only if the string length is greater than
+# the given value.
+min_query_string_len = 120
+
+# Split the text into strings of a specified number of words.
+number_of_words = 22
+
+# Try to skip quoted text.
+exclude_quote = True
+
+# Enable DOTALL regular expression flag in remove_wikicode() function.
+remove_wikicode_dotall = True
+
+# If the number of commas per 100 characters of text is greater than or
+# equal to 'comma_ratio', the script treats the text as a comma-separated
+# list and does not send it to the search engine.
+comma_ratio = 5
+
+# Skip the check if the page is a disambiguation page.
+skip_disambig = True
+
+# Parameter used in Live Search query.
+# (http://msdn2.microsoft.com/en-us/library/bb266177.aspx)
+region_code = 'en-US'
+
+enable_color = True
+
+warn_color = 'lightyellow'
+error_color = 'lightred'
+
+appdir = "copyright"
+output_file = pywikibot.config.datafilepath(appdir, "output.txt")
+
+pages_for_exclusion_database = [
+    ('it', 'Wikipedia:Sospette violazioni di copyright/Lista di esclusione',
+     'exclusion_list.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Abc', 'Abc.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Def', 'Def.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Ghi', 'Ghi.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Jkl', 'Jkl.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Mno', 'Mno.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Pqr', 'Pqr.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Stu', 'Stu.txt'),
+    ('en', 'Wikipedia:Mirrors_and_forks/Vwxyz', 'Vwxyz.txt'),
+    ('es', 'Wikipedia:Espejos de Wikipedia/Espejos_que_cumplen_la_GFDL_y_CC-BY-SA',
+     'Espejos.txt'),
+    ('it', 'Wikipedia:Cloni', 'Cloni.txt'),
+]
+
+reports_cat = {
+    'it': u'Segnalazioni automatiche sospetti problemi di copyright',
+    'es': u'Wikipedia:Páginas para revisar en busca de posible violación de copyright',
+}
+
+wikipedia_names = {
+    '--': u'Wikipedia',
+    'am': u'ዊኪፔድያ',
+    'an': u'Biquipedia',
+    'ang': u'Wicipǣdia',
+    'ar': u'ويكيبيديا',
+    'arc': u'ܘܝܟܝܦܕܝܐ',
+    'ast': u'Uiquipedia',
+    'az': u'Vikipediya',
+    'bat-smg': u'Vikipedėjė',
+    'be': u'Вікіпэдыя',
+    'be-x-old': u'Вікіпэдыя',
+    'bg': u'Уикипедия',
+    'bn': u'উইকিপিডিয়া',
+    'bpy': u'উইকিপিডিয়া',
+    'ca': u'Viquipèdia',
+    'ceb': u'Wikipedya',
+    'chr': u'ᏫᎩᏇᏗᏯ',
+    'cr': u'ᐎᑭᐱᑎᔭ',
+    'cs': u'Wikipedie',
+    'csb': u'Wikipedijô',
+    'cu': u'Википедї',
+    'cv': u'Википеди',
+    'cy': u'Wicipedia',
+    'diq': u'Wikipediya',
+    'dv': u'ވިކިޕީޑިއާ',
+    'el': u'Βικιπαίδεια',
+    'eo': u'Vikipedio',
+    'et': u'Vikipeedia',
+    'fa': u'ویکی‌پدیا',
+    'fiu-vro': u'Vikipeediä',
+    'fr': u'Wikipédia',
+    'frp': u'Vuiquipèdia',
+    'fur': u'Vichipedie',
+    'fy': u'Wikipedy',
+    'ga': u'Vicipéid',
+    'gu': u'વિકિપીડિયા',
+    'he': u'ויקיפדיה',
+    'hi': u'विकिपीडिया',
+    'hr': u'Wikipedija',
+    'hsb': u'Wikipedija',
+    'hu': u'Wikipédia',
+    'hy': u'Վիքիփեդիա',
+    'io': u'Wikipedio',
+    'iu': u'ᐅᐃᑭᐱᑎᐊ/oikipitia',
+    'ja': u'ウィキペディア',
+    'jbo': u'uikipedias',
+    'ka': u'ვიკიპედია',
+    'kk': u'Уикипедия',
+    'kn': u'ವಿಕಿಪೀಡಿಯ',
+    'ko': u'위키백과',
+    'ksh': u'Wikkipedija',
+    'la': u'Vicipaedia',
+    'lad': u'ויקיפידיה',
+    'lt': u'Vikipedija',
+    'lv': u'Vikipēdija',
+    'mk': u'Википедија',
+    'ml': u'വിക്കിപീഡിയ',
+    'mo': u'Википедия',
+    'mr': u'विकिपिडीया',
+    'mt': u'Wikipedija',
+    'nah': u'Huiquipedia',
+    'ne': u'विकिपीडिया',
+    'nrm': u'Viqùipédie',
+    'oc': u'Wikipèdia',
+    'os': u'Википеди',
+    'pa': u'ਵਿਕਿਪੀਡਿਆ',
+    'pt': u'Wikipédia',
+    'qu': u'Wikipidiya',
+    'rmy': u'Vikipidiya',
+    'ru': u'Википедия',
+    'sco': u'Wikipaedia',
+    'si': u'විකිපීඩියා',
+    'sk': u'Wikipédia',
+    'sl': u'Wikipedija',
+    'sr': u'Википедија',
+    'su': u'Wikipédia',
+    'ta': u'விக்கிபீடியா',
+    'tg': u'Википедиа',
+    'th': u'วิกิพีเดีย',
+    'tr': u'Vikipedi',
+    'uk': u'Вікіпедія',
+    'uz': u'Vikipediya',
+    'yi': u'‫װיקיפעדיע',
+    'zh': u'维基百科',
+    'zh-classical': u'維基大典',
+    'zh-yue': u'維基百科',
+}
+
+editsection_names = {
+    'ar': u'\[عدل\]',
+    'en': u'\[edit\]',
+    'fa': u'\[ویرایش\]',
+    'fr': u'\[modifier\]',
+    'de': u'\[Bearbeiten\]',
+    'es,pt': u'\[editar\]',
+    'it': u'\[modifica\]',
+    'is': u'\[breyti\]',
+    'ja': u'\[編集\]',
+    'zh': u'\[编辑\]',
+}
+
+sections_to_skip = {
+    'ar': [u'مراجع', u'قراءة أخرى', u'ملاحظات', u'وصلات خارجية'],
+    'en': [u'References', u'Further reading', u'Citations', u'External links'],
+    'fa': [u'منابع', u'منابع برای مطالعه بیشتر', u'یادکردها',
+           u'پیوند به بیرون'],
+    'es': [u'Referencias', u'Ver también', u'Bibliografía', u'Enlaces externos',
+           u'Notas'],
+    'fr': [u'Liens externes'],
+    'it': [u'Bibliografia', u'Discografia', u'Opere bibliografiche',
+           u'Riferimenti bibliografici', u'Collegamenti esterni',
+           u'Pubblicazioni', u'Pubblicazioni principali',
+           u'Bibliografia parziale'],
+    'is': [u'Heimildir', u'Tenglar', u'Tengt efni'],
+    'ja': [u'脚注', u'脚注欄', u'脚注・出典', u'出典', u'注釈'],
+    'zh': [u'參考文獻', u'参考文献', u'參考資料', u'参考资料', u'資料來源', u'资料来源',
+           u'參見', u'参见', u'參閱', u'参阅'],
+}
+
+if enable_color:
+    warn_color = '\03{%s}' % warn_color
+    error_color = '\03{%s}' % error_color
+    default_color = '\03{default}'
+else:
+    warn_color = error_color = default_color = ''
+
+site = pywikibot.Site()
+
+
+def _output(text, prefix=None, color=''):
+    if prefix:
+        pywikibot.output('%s%s: %s%s' % (color, prefix, default_color, text))
+    else:
+        pywikibot.output('%s%s' % (color, text))
+
+
+def warn(text, prefix=None):
+    _output(text, prefix=prefix, color=warn_color)
+
+
+def error(text, prefix=None):
+    _output(text, prefix=prefix, color=error_color)
+
+
+def skip_section(text):
+    sect_titles = '|'.join(sections_to_skip[pywikibot.Site().lang])
+    sectC = re.compile('(?mi)^==\s*(' + sect_titles + ')\s*==')
+    while True:
+        newtext = cut_section(text, sectC)
+        if newtext == text:
+            break
+        text = newtext
+    return text
+
+
+def cut_section(text, sectC):
+    sectendC = re.compile('(?m)^==[^=]')
+    start = sectC.search(text)
+    if start:
+        end = sectendC.search(text, start.end())
+        if end:
+            return text[:start.start()] + text[end.start():]
+        else:
+            return text[:start.start()]
+    return text
+
+
+class URLExclusion:
+    def __init__(self):
+        self.URLlist = set()
+        self.scan()
+
+    def pages_list(self):
+        for i in pages_for_exclusion_database:
+            path = pywikibot.config.datafilepath(appdir, i[0], i[2])
+            pywikibot.config.makepath(path)
+            page = pywikibot.Page(pywikibot.Site(i[0]), i[1])
+            yield page, path
+
+    def download(self, force_update=False):
+        for page, path in self.pages_list():
+            download = force_update
+            try:
+                if not os.path.exists(path):
+                    pywikibot.output('Creating file \'%s\' (%s)'
+                                     % (pywikibot.config.shortpath(path),
+                                        page.title(asLink=True)))
+                    download = True
+                else:
+                    file_age = time.time() - os.path.getmtime(path)
+                    if download or file_age > 24 * 60 * 60:
+                        pywikibot.output('Updating file \'%s\' (%s)'
+                                         % (pywikibot.config.shortpath(path),
+                                            page.title(asLink=True)))
+                        download = True
+            except OSError:
+                raise
+
+            if download:
+                data = None
+                try:
+                    data = page.get()
+                except KeyboardInterrupt:
+                    raise
+                except pywikibot.IsRedirectPage:
+                    data = page.getRedirectTarget().get()
+                except:
+                    error('Getting page failed')
+
+                if data:
+                    f = codecs.open(path, 'w', 'utf-8')
+                    f.write(data)
+                    f.close()
+
+    def update(self):
+        self.download(force_update=True)
+        self.scan()
+
+    def check(self, url, verbose=False):
+        for entry in self.URLlist:
+            if entry in url:
+                if verbose > 1:
+                    warn('URL Excluded: %s\nReason: %s' % (url, entry))
+                elif verbose:
+                    warn('URL Excluded: %s' % url)
+                return True
+        return False
+
+    def scan(self):
+        prelist = []
+        result_list = []
+        self.download()
+
+        for page, path in self.pages_list():
+            if 'exclusion_list.txt' in path:
+                result_list += re.sub("</?pre>", "",
+                                      read_file(path,
+                                                cut_comment=True,
+                                                cut_newlines=True)
+                                      ).splitlines()
+            else:
+                data = read_file(path)
+                # wikipedia:en:Wikipedia:Mirrors and forks
+                prelist += re.findall("(?i)url\s*=\s*<nowiki>(?:http://)?(.*)</nowiki>",
+                                      data)
+                prelist += re.findall("(?i)\*\s*Site:\s*\[?(?:http://)?(.*)\]?",
+                                      data)
+                # wikipedia:it:Wikipedia:Cloni
+                if 'it/Cloni.txt' in path:
+                    prelist += re.findall('(?mi)^==(?!=)\s*\[?\s*(?:<nowiki>)?\s*(?:http://)?(.*?)(?:</nowiki>)?\s*\]?\s*==',
+                                          data)
+        list1 = []
+        for entry in prelist:
+            list1 += entry.split(", ")
+        list2 = []
+        for entry in list1:
+            list2 += entry.split("and ")
+        for entry in list2:
+            # Remove unnecessary part of URL
+            entry = re.sub("(http://|www\.)", "", entry)
+            entry = re.sub("</?nowiki>", "", entry)
+            if entry:
+                if '/' in entry:
+                    entry = entry[:entry.rfind('/')]
+
+                entry = re.sub("\s.*", "", entry)
+
+                if len(entry) > 4:
+                    result_list.append(entry)
+
+        result_list += read_file(
+            pywikibot.config.datafilepath(appdir, 'exclusion_list.txt'),
+            cut_comment=True, cut_newlines=True).splitlines()
+
+        for item in result_list:
+            cleaned = item.strip()
+            if cleaned:
+                self.URLlist.add(cleaned)
+
+    def sanity_check(self):
+        pywikibot.output("Exclusion list sanity check...")
+        for entry in self.URLlist:
+            if ('.' not in entry and '/' not in entry) or len(entry) < 5:
+                pywikibot.output("** " + entry)
+
+    def dump(self):
+        f = open(pywikibot.config.datafilepath(appdir, 'exclusion_list.dump'),
+                 'w')
+        f.write('\n'.join(self.URLlist))
+        f.close()
+        pywikibot.output("Exclusion list dump saved.")
+
+
+def read_file(filename, cut_comment=False, cut_newlines=False):
+    text = u""
+    f = codecs.open(filename, 'r', 'utf-8')
+    text = f.read()
+    f.close()
+    if cut_comment:
+        text = re.sub(" ?#.*", "", text)
+    if cut_newlines:
+        text = re.sub("(?m)^\r?\n", "", text)
+    return text
+
+
+def write_log(text, filename=output_file):
+    f = codecs.open(filename, 'a', 'utf-8')
+    f.write(text)
+    f.close()
+
+#
+# Ignore text that contains a comma-separated list, only numbers,
+# punctuation...
+
+
+def economize_query(text):
+    # Comma separated list
+    c = text.count(', ')
+    if c > 4:
+        l = len(text)
+        r = 100 * float(c) / l
+        if r >= comma_ratio:
+            return True
+
+    # Numbers
+    if re.search('[^0-9\'*/,. +?:;-]{5}', text):
+        return False
+    return True
+
+#
+# Set the regex used in remove_wikicode() to remove [[Image:]] tags
+# and the regex used in check_in_source() to reject pages containing
+# 'Wikipedia'.
+
+
+def join_family_data(reString, namespace):
+    for s in site.namespaces[namespace]:
+        if type(s) == list:
+            for e in s:
+                reString += '|' + e
+        else:
+            reString += '|' + s
+    return '\s*(' + reString + ')\s*'
+
+reImageC = re.compile('\[\[' + join_family_data('Image', 6) + ':.*?\]\]', re.I)
+reWikipediaC = re.compile('(' + '|'.join(wikipedia_names.values()) + ')', re.I)
+reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')
+
+
+def remove_wikicode(text, re_dotall=False, remove_quote=exclude_quote,
+                    debug=False):
+    if not text:
+        return ""
+
+    if debug:
+        write_log(text + '\n', "copyright/wikicode.txt")
+
+    text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>',
+                  '', text)
+    text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text)
+    text = re.sub('<!--.*?-->', '', text)
+
+    text = text.replace('&lt;', '<')
+    text = text.replace('&gt;', '>')
+
+    # remove URL
+    text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
+
+    # remove Image tags
+    text = reImageC.sub("", text)
+
+    # replace piped wikilink
+    text = re.sub("\[\[[^\]]*?\|(.*?)\]\]", "\\1", text)
+
+    # remove unicode and polytonic template
+    text = re.sub("(?i){{(unicode|polytonic)\|(.*?)}}", "\\1", text)
+
+    if re_dotall:
+        flags = "(?xsim)"
+        # exclude wikitable
+        text = re.sub('(?s){\|.*?^\|}', '', text)
+    else:
+        flags = "(?xim)"
+
+    text = re.sub("""
+    %s
+    (
+        <ref[^>]*?\s*/\s*>     | # exclude <ref name = '' / > tags
+        <ref.*?>.*?</ref>      | # exclude <ref> notes
+        ^[\ \t]*({\||[|!]).*?$ | # exclude wikitable
+        </*nowiki>             | # remove <nowiki> tags
+        {{.*?}}                | # remove (not nested) template
+        <math>.*?</math>       | # remove LaTeX stuff
+        [\[\]]                 | # remove [, ]
+        ^[*:;]+                | # remove *, :, ; at the beginning of a line
+        <!--                   |
+        -->                    |
+    )
+    """ % flags, "", text)
+
+    if remove_quote:
+        # '' text ''
+        # '' text ''.
+        # '' text '' (text)
+        # « text »
+        # ...
+        #
+
+        italic_quoteC = re.compile("(?m)^[:*]?\s*(''.*?'')\.?\s*(\(.*?\))?\r?$")
+
+        index = 0
+        try:
+            import pywikiparser
+        except ImportError:
+            pywikiparser = False
+
+        while pywikiparser:
+            m = italic_quoteC.search(text, index)
+            if not m:
+                break
+
+            s = pywikiparser.Parser(m.group(1))
+
+            try:
+                xmldata = s.parse().toxml()
+                if '<wikipage><p><i>' in xmldata and \
+                   '</i></p></wikipage>' in xmldata:
+                    if xmldata.count('<i>') == 1:
+                        text = text[:m.start()] + text[m.end():]
+            except:
+                pass
+
+            index = m.start() + 1
+
+        text = re.sub('(?m)^[:*]*\s*["][^"]+["]\.?\s*(\(.*?\))?\r?$', "", text)
+        text = re.sub('(?m)^[:*]*\s*[«][^»]+[»]\.?\s*(\(.*?\))?\r?$', "", text)
+        text = re.sub('(?m)^[:*]*\s*[“][^”]+[”]\.?\s*(\(.*?\))?\r?$', "", text)
+
+    # remove useless spaces
+    text = re.sub("(?m)(^[ \t]+|[ \t]+\r?$)", "", text)
+
+    if debug:
+        write_log(text + '\n', "copyright/wikicode_removed.txt")
+
+    return text
+
+
+def n_index(text, n, sep):
+    pos = 0
+    while n > 0:
+        try:
+            pos = text.index(sep, pos + 1)
+            n -= 1
+        except ValueError:
+            return 0
+    return pos
+
+
+def mysplit(text, dim, sep):
+    if sep not in text:
+        return [text]
+    t = text
+    l = list()
+    while t:
+        if sep in t:
+            n = n_index(t, dim, sep)
+            if n > 0:
+                l.append(t[:n])
+                t = t[n + 1:]
+                continue
+        l.append(t)
+        break
+    return l
+
+
+class SearchEngine:
+
+    num_google_queries = num_yahoo_queries = num_msn_queries = 0
+
+    def __init__(self):
+        self.URLexcl = URLExclusion()
+
+    def __del__(self):
+        self.print_stats()
+
+    def query(self, lines=[], max_query_len=1300, wikicode=True):
+        # Google max_query_len = 1480?
+        # - '-Wikipedia ""' = 1467
+
+        # Google limit queries to 32 words.
+
+        n_query = 0
+        output = unicode()
+        previous_group_url = 'null'
+
+        for line in lines:
+            if wikicode:
+                line = remove_wikicode(line)
+            for search_words in mysplit(line, number_of_words, " "):
+                if len(search_words) > min_query_string_len:
+                    if config.copyright_economize_query:
+                        if economize_query(search_words):
+                            warn(search_words, prefix='Text excluded')
+                            consecutive = False
+                            continue
+                    n_query += 1
+                    #pywikibot.output(search_words)
+                    if config.copyright_max_query_for_page and \
+                       n_query > config.copyright_max_query_for_page:
+                        warn(u"Max query limit for page reached")
+                        return output
+                    if config.copyright_skip_query > n_query:
+                        continue
+                    if len(search_words) > max_query_len:
+                        search_words = search_words[:max_query_len]
+                        consecutive = False
+                        if " " in search_words:
+                            search_words = search_words[
+                                :search_words.rindex(" ")]
+
+                    results = self.get_results(search_words)
+                    group_url = ''
+                    cmp_group_url = ''
+
+                    for url, engine, comment in results:
+                        if comment:
+                            group_url += '\n*%s - %s (%s)' % (engine,
+                                                              url,
+                                                              "; ".join(comment)
+                                                              )
+                        else:
+                            group_url += '\n*%s - %s' % (engine, url)
+                        cmp_group_url += '\n*%s - %s' % (engine, url)
+                    if results:
+                        group_url_list = group_url.splitlines()
+                        cmp_group_url_list = cmp_group_url.splitlines()
+                        group_url_list.sort()
+                        cmp_group_url_list.sort()
+                        group_url = '\n'.join(group_url_list)
+                        cmp_group_url = '\n'.join(cmp_group_url_list)
+                        if previous_group_url == cmp_group_url:
+                            if consecutive:
+                                output += ' ' + search_words
+                            else:
+                                output += '\n**' + search_words
+                        else:
+                            output += group_url + '\n**' + search_words
+
+                        previous_group_url = cmp_group_url
+                        consecutive = True
+                    else:
+                        consecutive = False
+                else:
+                    consecutive = False
+        return output
+
+    def add_in_urllist(self, url, add_item, engine, cache_url=None):
+
+        check_in_source = (engine == 'google' and
+                           config.copyright_check_in_source_google or
+                           engine == 'yahoo' and
+                           config.copyright_check_in_source_yahoo or
+                           engine == 'msn' and
+                           config.copyright_check_in_source_msn)
+
+        if check_in_source or config.copyright_show_date or \
+           config.copyright_show_length:
+            s = None
+            cache = False
+
+            # list to store date, length, cache URL
+            comment = list()
+
+            try:
+                s = WebPage(add_item, self.URLexcl)
+            except URL_exclusion:
+                pass
+            except NoWebPage:
+                cache = True
+
+            if s:
+                # Before adding the URL to the result list, check the source
+                if check_in_source:
+                    if s.check_in_source():
+                        return
+
+                if config.copyright_show_date:
+                    date = s.lastmodified()
+                    if date:
+                        if date[:3] != time.localtime()[:3]:
+                            comment.append("%s/%s/%s"
+                                           % (date[2], date[1], date[0]))
+
+                unit = 'bytes'
+                if config.copyright_show_length:
+                    length = s.length()
+                    if length > 1024:
+                        # convert to kilobytes
+                        length /= 1024
+                        unit = 'KB'
+                        if length > 1024:
+                            # convert to megabytes
+                            length /= 1024
+                            unit = 'MB'
+                    if length > 0:
+                        comment.append("%d %s" % (length, unit))
+            if cache:
+                if cache_url:
+                    if engine == 'google':
+                        comment.append(
+                            '[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]'
+                            % urllib.quote(short_url(add_item)))
+                    elif engine == 'yahoo':
+                        comment.append("''Yahoo cache''")
+                    elif engine == 'msn':
+                        comment.append('[%s Live cache]'
+                                       % re.sub('&lang=[^&]*', '', cache_url))
+                else:
+                    comment.append('[http://web.archive.org/*/%s archive.org]'
+                                   % short_url(add_item))
+        for i in range(len(url)):
+            if add_item in url[i]:
+                if engine not in url[i][1]:
+                    if url[i][2]:
+                        comment = url[i][2]
+                    url[i] = (add_item, url[i][1] + ', ' + engine, comment)
+                return
+        url.append((add_item, engine, comment))
+        return
+
+    def soap(self, engine, query, url, numresults=10):
+        pywikibot.output("  %s query..." % engine.capitalize())
+        search_request_retry = config.copyright_connection_tries
+        query_success = False
+
+        while search_request_retry:
+            try:
+                if engine == 'google':
+                    import google
+                    google.LICENSE_KEY = config.google_key
+                    data = google.doGoogleSearch('%s "%s"'
+                                                 % (no_result_with_those_words,
+                                                    query))
+                    for entry in data.results:
+                        self.add_in_urllist(url, entry.URL, 'google',
+                                            entry.cachedSize)
+                    self.num_google_queries += 1
+
+                elif engine == 'yahoo':
+                    import yahoo.search.web
+                    data = yahoo.search.web.WebSearch(config.yahoo_appid,
+                                                      query='"%s" %s' % (
+                                                      query.encode('utf_8'),
+                                                      no_result_with_those_words
+                                                      ), results=numresults)
+                    for entry in data.parse_results():
+                        cacheurl = None
+                        if entry.Cache:
+                            cacheurl = entry.Cache.Url
+                        self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl)
+
+                    self.num_yahoo_queries += 1
+
+                elif engine == 'msn':
+                    #max_query_len = 150?
+                    from SOAPpy import WSDL
+
+                    try:
+                        server = WSDL.Proxy(
+                            'http://soap.search.msn.com/webservices.asmx?wsdl')
+                    except Exception, err:
+                        error("Live Search Error: %s" % err)
+                        raise
+
+                    params = {'AppID': config.msn_appid,
+                              'Query': '%s "%s"' % (no_result_with_those_words,
+                                                    query),
+                              'CultureInfo': region_code,
+                              'SafeSearch': 'Off',
+                              'Requests': {
+                                  'SourceRequest': {'Source': 'Web',
+                                                    'Offset': 0,
+                                                    'Count': 10,
+                                                    'ResultFields': 'All',
+                                                    }}}
+
+                    results = ''
+                    server_results = server.Search(Request=params)
+                    if server_results.Responses[0].Results:
+                        results = server_results.Responses[0].Results[0]
+                    if results:
+                        # list or instance?
+                        if type(results) == list:
+                            for entry in results:
+                                cacheurl = None
+                                if hasattr(entry, 'CacheUrl'):
+                                    cacheurl = entry.CacheUrl
+                                self.add_in_urllist(url, entry.Url, 'msn',
+                                                    cacheurl)
+                        else:
+                            cacheurl = None
+                            if hasattr(results, 'CacheUrl'):
+                                cacheurl = results.CacheUrl
+                            self.add_in_urllist(url, results.Url, 'msn',
+                                                cacheurl)
+                    self.num_msn_queries += 1
+                search_request_retry = 0
+                query_success = True
+            except KeyboardInterrupt:
+                raise
+            except Exception, err:
+                # Something is going wrong...
+                if 'Daily limit' in str(err) or \
+                   'Insufficient quota for key' in str(err):
+                    exceeded_in_queries('google')
+                elif 'limit exceeded' in str(err):
+                    exceeded_in_queries('yahoo')
+                elif 'Invalid value for AppID in request' in str(err):
+                    exceeded_in_queries('msn')
+                else:
+                    error(err, "Got an error")
+
+                if search_request_retry:
+                    search_request_retry -= 1
+
+        if not query_success:
+            error('No response for: %s' % query, "Error (%s)" % engine)
+
+    def get_results(self, query, numresults=10):
+        result_list = list()
+        query = re.sub("[()\"<>]", "", query)
+        pywikibot.output(query)
+        if config.copyright_google:
+            self.soap('google', query, result_list)
+        if config.copyright_yahoo:
+            self.soap('yahoo', query, result_list, numresults=numresults)
+        if config.copyright_msn:
+            self.soap('msn', query, result_list)
+
+        offset = 0
+        for i in range(len(result_list)):
+            if self.URLexcl.check(result_list[i + offset][0], verbose=True):
+                result_list.pop(i + offset)
+                offset += -1
+        return result_list
+
+    def print_stats(self):
+        pywikibot.output('\n'
+                         'Search engine | number of queries\n'
+                         '---------------------------------\n'
+                         'Google        | %s\n'
+                         'Yahoo!        | %s\n'
+                         'Live Search   | %s\n'
+                         % (self.num_google_queries, self.num_yahoo_queries,
+                            self.num_msn_queries))
+
+source_seen = set()
+positive_source_seen = set()
+
+
+class NoWebPage(Exception):
+    """Web page does not exist (404)"""
+
+
+class URL_exclusion(Exception):
+    """URL in exclusion list"""
+
+
+class WebPage(object):
+    """Fetch a web page and expose its contents and metadata.
+    """
+
+    def __init__(self, url, URLExcl):
+        global source_seen
+        self.URLexcludedlist = URLExcl.URLlist
+
+        if url in source_seen or URLExcl.check(url):
+            raise URL_exclusion
+
+        self._url = url
+
+        try:
+            self._urldata = urllib2.urlopen(
+                urllib2.Request(self._url, None,
+                                {'User-Agent': pywikibot.useragent}))
+        except urllib2.HTTPError, err:
+            error("HTTP error: %d / %s (%s)" % (err.code, err.msg, url))
+            if err.code >= 400:
+                source_seen.add(self._url)
+                raise NoWebPage
+            return None
+        except urllib2.URLError, arg:
+            error("URL error: %s / %s" % (url, arg))
+            return None
+        except Exception, err:
+            error("ERROR: %s" % (err))
+
+        self._lastmodified = self._urldata.info().getdate('Last-Modified')
+        self._length = self._urldata.info().getheader('Content-Length')
+        self._content_type = self._urldata.info().getheader('Content-Type')
+
+    def length(self):
+        if hasattr(self, '_length'):
+            if self._length:
+                return int(self._length)
+        if hasattr(self, '_contents'):
+            return len(self._contents)
+
+    def lastmodified(self):
+        if hasattr(self, '_lastmodified'):
+            return self._lastmodified
+
+    def get(self, force=False):
+        # Exclude URL with listed file extension.
+        if self._url[-4:] in [".pdf", ".doc", ".ppt"]:
+            raise URL_exclusion
+
+        # Make sure the contents are only fetched once
+        if not hasattr(self, '_contents'):
+            self._contents = self._urldata.read()
+        return self._contents
+
+    def check_regexp(self, reC, text, filename=None):
+        m = reC.search(text)
+        if m:
+            global positive_source_seen
+            self.URLexcludedlist.add(self._url)
+            positive_source_seen.add(self._url)
+            if filename:
+                write_log("%s (%s)\n" % (self._url, m.group()), filename)
+            return True
+
+    def check_in_source(self):
+        """ Sources may be different from search engine database and include
+        mentions of Wikipedia. This function avoid also errors in search 
results
+        that can occurs either with Google and Yahoo! service.
+
+        """
+        global source_seen
+
+        if not hasattr(self, '_urldata'):
+            return False
+        if self._url in positive_source_seen:
+            return True
+        if self._url in source_seen:
+            return False
+
+        try:
+            text = self.get()
+        except URL_exclusion:
+            return False
+
+        # Convert the character encoding if the 'Content-Type' header has
+        # its charset attribute set to UTF-8.
+
+        if text:
+            if 'utf-8' in self._content_type.lower():
+                text = text.decode("utf-8", 'replace')
+            else:
+                # <META> declaration with "http-equiv" set to "Content-Type" in HTML document.
+                if 'text/html' in self._content_type and (
+                        re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>",
+                                  text) or
+                        re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>",
+                                  text)):
+                    text = text.decode("utf-8", 'replace')
+
+            if config.copyright_check_in_source_section_names:
+                if self.check_regexp(reSectionNamesC, text,
+                                     "copyright/sites_with_'[edit]'.txt"):
+                    return True
+
+            if self.check_regexp(reWikipediaC, text,
+                                 "copyright/sites_with_'wikipedia'.txt"):
+                return True
+        source_seen.add(self._url)
+        return False
+
+
+def exceeded_in_queries(engine):
+    """Behavior if an exceeded error occur."""
+
+    # Disable search engine
+    if config.copyright_exceeded_in_queries == 1:
+        setattr(config, 'copyright_' + engine, False)
+    # Sleeping
+    if config.copyright_exceeded_in_queries == 2:
+        error("Got a queries exceeded error from %s. Sleeping for %d hours..."
+              % (engine.capitalize(),
+                 config.copyright_exceeded_in_queries_sleep_hours))
+        time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 3600)
+    # Stop execution
+    if config.copyright_exceeded_in_queries == 3:
+        raise pywikibot.Error('Got a queries exceeded error.')
+
+
+def get_by_id(title, id):
+    return pywikibot.Site().getUrl(
+        "/w/index.php?title=%s&oldid=%s&action=raw" % (title, id))
+
+
+def checks_by_ids(ids):
+    searcher = SearchEngine()
+    for title, id in ids:
+        original_text = get_by_id(title, id)
+        if original_text:
+            pywikibot.output(original_text)
+            output = searcher.query(lines=original_text.splitlines())
+            if output:
+                write_log(
+                    "=== [[" + title + "]] ===\n{{botbox|%s|prev|%s|%s|00}}"
+                    % (title.replace(" ", "_").replace("\"", "%22"),
+                       id, "author")
+                    + output,
+                    pywikibot.config.datafilepath(appdir, "ID_output.txt"))
+
+
+class CheckRobot():
+    def __init__(self, generator):
+        self.generator = generator
+        self.SearchEngine = SearchEngine()
+        self.site = pywikibot.Site()
+
+    def run(self):
+        for page in self.generator:
+            try:
+                original_text = page.get()
+            except pywikibot.NoPage:
+                pywikibot.output(u'Page %s not found' % page.title())
+                continue
+            except pywikibot.IsRedirectPage:
+                newpage = page.getRedirectTarget()
+                pywikibot.output(u'Page %s redirects to \'%s\''
+                                 % (page.title(asLink=True), newpage.title()))
+                bot = CheckRobot(iter([newpage]))
+                bot.run()
+                continue
+            except pywikibot.SectionError:
+                error("Page %s has no section %s"
+                      % (page.title(), page.section()))
+                continue
+
+            if skip_disambig:
+                if page.isDisambig():
+                    pywikibot.output(u'Page %s is a disambiguation page'
+                                     % page.title(asLink=True))
+                    continue
+
+            pywikibot.output(page.title())
+
+            if original_text:
+                text = skip_section(original_text)
+
+                if remove_wikicode_dotall:
+                    text = remove_wikicode(text, re_dotall=True)
+
+                output = self.SearchEngine.query(
+                    lines=text.splitlines(),
+                    wikicode=not remove_wikicode_dotall)
+                if output:
+                    write_log('=== [[%s]] ===%s\n' % (page.title(), output),
+                              filename=output_file)
+
+
+def short_url(url):
+    return url[url.index('://') + 3:]
+
+
+def put(page, text, comment):
+    while True:
+        try:
+            page.put(text, comment=comment)
+            break
+        except pywikibot.SpamfilterError, url:
+            warn(url, prefix="Spam filter")
+            text = re.sub(url[0], '<blacklist>' + short_url(url[0]), text)
+        except pywikibot.EditConflict:
+            warn("Edit conflict")
+            raise pywikibot.EditConflict
+
+
+def check_config(var, license_id, license_name):
+    if var:
+        if not license_id:
+            warn(u"You have not set a %s; the search engine is disabled."
+                 % license_name, prefix="WARNING")
+            return False
+    return var
+
+
+def setSavepath(path):
+    global output_file
+    output_file = path
+
+
+def main(*args):
+    gen = None
+    # pages which will be processed when the -page parameter is used
+    PageTitles = []
+    # IDs which will be processed when the -ids parameter is used
+    ids = None
+    # Which namespaces should be processed?
+    # default to [] which means all namespaces will be processed
+    namespaces = []
+    #
+    repeat = False
+    #
+    text = None
+    # Number of pages to load at a time by Preload generator
+    step = 40
+    # Default number of pages for NewPages generator
+    number = 60
+
+    # This factory is responsible for processing command line arguments
+    # that are also used by other scripts and that determine on which pages
+    # to work on.
+    genFactory = pagegenerators.GeneratorFactory()
+
+    # Read commandline parameters.
+    for arg in pywikibot.handle_args(args):
+        if arg == '-y':
+            config.copyright_yahoo = True
+        elif arg == '-g':
+            config.copyright_google = True
+        elif arg == '-l':
+            config.copyright_msn = True
+        elif arg == '-ny':
+            config.copyright_yahoo = False
+        elif arg == '-ng':
+            config.copyright_google = False
+        elif arg == '-nl':
+            config.copyright_msn = False
+        elif arg.startswith('-output'):
+            if len(arg) >= 8:
+                setSavepath(arg[8:])
+        elif arg.startswith('-maxquery'):
+            if len(arg) >= 10:
+                config.copyright_max_query_for_page = int(arg[10:])
+        elif arg.startswith('-skipquery'):
+            if len(arg) >= 11:
+                config.copyright_skip_query = int(arg[11:])
+        elif arg.startswith('-text'):
+            if len(arg) >= 6:
+                text = arg[6:]
+        elif arg.startswith('-page'):
+            if len(arg) == 5:
+                PageTitles.append(pywikibot.input(
+                    u'Which page do you want to change?'))
+            else:
+                PageTitles.append(arg[6:])
+        elif arg.startswith('-namespace:'):
+            try:
+                namespaces.append(int(arg[11:]))
+            except ValueError:
+                namespaces.append(arg[11:])
+        elif arg.startswith('-forceupdate'):
+            URLExclusion().update()
+        elif arg == '-repeat':
+            repeat = True
+        elif arg.startswith('-new'):
+            if len(arg) >= 5:
+                number = int(arg[5:])
+            gen = pagegenerators.NewpagesPageGenerator(number=number,
+                                                       repeat=repeat)
+            # The preloading generator works better if 'step' is not larger
+            # than 'number'; this avoids unnecessary delay.
+            if number < step:
+                step = number
+        else:
+            genFactory.handleArg(arg)
+
+    if PageTitles:
+        pages = [pywikibot.Page(pywikibot.Site(),
+                                PageTitle) for PageTitle in PageTitles]
+        gen = iter(pages)
+
+    config.copyright_yahoo = check_config(config.copyright_yahoo,
+                                          config.yahoo_appid, "Yahoo AppID")
+    config.copyright_google = check_config(config.copyright_google,
+                                           config.google_key,
+                                           "Google Web API license key")
+    config.copyright_msn = check_config(config.copyright_msn,
+                                        config.msn_appid, "Live Search AppID")
+
+    if ids:
+        checks_by_ids(ids)
+
+    if not gen:
+        gen = genFactory.getCombinedGenerator()
+    if not gen and not ids and not text:
+        # syntax error, show help text from the top of this file
+        pywikibot.output(__doc__, 'utf-8')
+
+    if text:
+        output = SearchEngine().query(lines=text.splitlines())
+        if output:
+            pywikibot.output(output)
+
+    if not gen:
+        return
+    if namespaces:
+        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+    preloadingGen = pagegenerators.PreloadingGenerator(gen, step=step)
+    bot = CheckRobot(preloadingGen)
+    bot.run()
+
+if number_of_words > 22 and config.copyright_msn:
+        warn("Live Search requires a lower value for 'number_of_words' "
+             "variable (current value is %d, a good value may be 22)."
+             % (number_of_words), prefix='Warning')
+
+
+if __name__ == "__main__":
+    main()
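
Note: to make SearchEngine.query() above easier to review, here is a small,
self-contained sketch of how page text is chunked and filtered before being
sent to a search engine. It mirrors the roles of number_of_words,
min_query_string_len and the comma check in economize_query() in simplified
form (the real code also skips mostly-numeric chunks); it is an illustration
only, not part of the patch:

    number_of_words = 22         # words per chunk
    min_query_string_len = 120   # skip chunks shorter than this
    comma_ratio = 5              # commas per 100 characters => list-like text

    def chunks(line):
        words = line.split(' ')
        for i in range(0, len(words), number_of_words):
            yield ' '.join(words[i:i + number_of_words])

    def looks_like_list(chunk):
        commas = chunk.count(', ')
        return commas > 4 and 100.0 * commas / len(chunk) >= comma_ratio

    def candidate_queries(text):
        for line in text.splitlines():
            for chunk in chunks(line):
                if len(chunk) > min_query_string_len:
                    if not looks_like_list(chunk):
                        yield chunk
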
diff --git a/scripts/copyright/copyright_clean.py b/scripts/copyright/copyright_clean.py
new file mode 100644
index 0000000..eec4947
--- /dev/null
+++ b/scripts/copyright/copyright_clean.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8  -*-
+"""
+"""
+
+#
+# (C) Francesco Cosoleto, 2006
+# (c) Pywikibot team 2006-2015
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+#
+
+import re
+
+import pywikibot
+from pywikibot import pagegenerators
+from copyright import mysplit, put, reports_cat, join_family_data
+
+
+summary_msg = {
+    'ar': u'إزالة',
+    'en': u'Removing',
+    'fa': u'حذف',
+    'fr': u'Retiré',
+    'it': u'Rimozione',
+    'ru': u'Удаление',
+    'uk': u'Видалення',
+}
+
+headC = re.compile(
+    "(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?\[\[(?::)?(.*?)\]\]")
+separatorC = re.compile('(?m)^== +')
+next_headC = re.compile("(?m)^=+.*?=+")
+
+
+# {{botbox|title|newid|oldid|author|...}}
+rev_templateC = re.compile(
+    "(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")
+
+
+class CopyrightCleanBot(pywikibot.Bot):
+
+    query_results_titles = list()
+    query_results_revids = list()
+
+    def __init__(self, generator):
+        super(CopyrightCleanBot, self).__init__()
+        self.generator = generator
+
+    def query_api(self, data):
+        predata = {
+            'action': 'query',
+            'prop': 'revisions',
+        }
+        predata = self.CombineParams(predata, data)
+        return pywikibot.data.api.Request(**predata).submit()
+
+    def query_old_api(self, data):
+
+        predata = {
+            'what': 'revisions',
+            'rvlimit': '1',
+        }
+        predata = self.CombineParams(predata, data)
+        return pywikibot.data.api.Request(**predata).submit()
+
+    def old_page_exist(self, title):
+        for pageobjs in self.query_results_titles:
+            for key in pageobjs['pages']:
+                if pageobjs['pages'][key]['title'] == title:
+                    if int(key) >= 0:
+                        return True
+        pywikibot.output('* ' + title)
+        return False
+
+    def old_revid_exist(self, revid):
+        for pageobjs in self.query_results_revids:
+            for id in pageobjs['pages']:
+                for rv in range(len(pageobjs['pages'][id]['revisions'])):
+                    if pageobjs['pages'][id]['revisions'][rv]['revid'] == \
+                       int(revid):
+                        # print rv
+                        return True
+        pywikibot.output('* ' + revid)
+        return False
+
+    def page_exist(self, title):
+        for pageobjs in self.query_results_titles:
+            for key in pageobjs['query']['pages']:
+                if pageobjs['query']['pages'][key]['title'] == title:
+                    if 'missing' in pageobjs['query']['pages'][key]:
+                        pywikibot.output('* ' + title)
+                        return False
+        return True
+
+    def revid_exist(self, revid):
+        for pageobjs in self.query_results_revids:
+            if 'badrevids' in pageobjs['query']:
+                for id in pageobjs['query']['badrevids']:
+                    if id == int(revid):
+                        # print rv
+                        pywikibot.output('* ' + revid)
+                        return False
+        return True
+
+    def treat(self, page):
+        data = page.get()
+        pywikibot.output(page.title(asLink=True))
+        output = ''
+
+        #
+        # Preserve the text that comes before the sections
+        #
+
+        m = re.search("(?m)^==\s*[^=]*?\s*==", data)
+        if m:
+            output = data[:m.end() + 1]
+        else:
+            m = re.search("(?m)^===\s*[^=]*?", data)
+            if m:
+                output = data[:m.start()]
+
+        titles = headC.findall(data)
+        titles = [re.sub("#.*", "", item) for item in titles]
+        revids = rev_templateC.findall(data)
+
+        # No more than 50 titles at a time using the API
+        for s in mysplit(self.ListToParam(titles), 50, "|"):
+            self.query_results_titles.append(self.query_api({'titles': s}))
+        for s in mysplit(self.ListToParam(revids), 50, "|"):
+            self.query_results_revids.append(self.query_api({'revids': s}))
+
+        comment_entry = list()
+        add_separator = False
+        index = 0
+
+        while True:
+            head = headC.search(data, index)
+            if not head:
+                break
+            index = head.end()
+            title = re.sub("#.*", "", head.group(1))
+            next_head = next_headC.search(data, index)
+            if next_head:
+                if separatorC.search(data[next_head.start():next_head.end()]):
+                    add_separator = True
+                stop = next_head.start()
+            else:
+                stop = len(data)
+
+            exist = True
+            if self.page_exist(title):
+                # check {{botbox}}
+                revid = re.search("{{(?:/box|botbox)\|.*?\|(.*?)\|",
+                                  data[head.end():stop])
+                if revid:
+                    if not self.revid_exist(revid.group(1)):
+                        exist = False
+            else:
+                exist = False
+
+            if exist:
+                ctitle = re.sub(u'(?i)=== \[\[%s:'
+                                % join_family_data('Image', 6),
+                                ur'=== [[:\1:', title)
+                ctitle = re.sub(u'(?i)=== \[\[%s:'
+                                % join_family_data('Category', 14),
+                                ur'=== [[:\1:', ctitle)
+                output += "=== [[" + ctitle + "]]" + data[head.end():stop]
+            else:
+                comment_entry.append("[[%s]]" % title)
+
+            if add_separator:
+                output += data[next_head.start():next_head.end()] + '\n'
+                add_separator = False
+
+        add_comment = u'%s: %s' % (pywikibot.translate(pywikibot.Site(),
+                                                       summary_msg),
+                                   ", ".join(comment_entry))
+
+        # remove useless newlines
+        output = re.sub("(?m)^\n", "", output)
+
+        if comment_entry:
+            pywikibot.output(add_comment)
+            if pywikibot.config.verbose_output:
+                pywikibot.showDiff(page.get(), output)
+
+            choice = pywikibot.inputChoice(u'Do you want to clean the page?',
+                                               ['Yes', 'No'], ['y', 'n'], 'n')
+            if choice == 'y':
+                try:
+                    put(page, output, add_comment)
+                except pywikibot.PageNotSaved:
+                    raise
+
+    pywikibot.stopme()
+
+    #
+    #
+    # Helper utilities
+    #
+    #
+
+    def CleanParams(self, params):
+        """Params may be either a tuple, a list of tuples or a dictionary.
+        This method will convert it into a dictionary
+        """
+        if params is None:
+            return {}
+        pt = type(params)
+        if pt == dict:
+            return params
+        elif pt == tuple:
+            if len(params) != 2:
+                raise pywikibot.Error("Tuple size must be 2")
+            return {params[0]: params[1]}
+        elif pt == list:
+            for p in params:
+                if type(p) != tuple or len(p) != 2:
+                    raise pywikibot.Error(
+                        "Every list element must be a 2 item tuple")
+            return dict(params)
+        else:
+            raise pywikibot.Error("Unknown param type %s" % pt)
+
+    def CombineParams(self, params1, params2):
+        """Merge two dictionaries. If they have the same keys, their values 
will
+        be appended one after another separated by the '|' symbol.
+        """
+
+        params1 = self.CleanParams(params1)
+        if params2 is None:
+            return params1
+        params2 = self.CleanParams(params2)
+
+        for k, v2 in params2.iteritems():
+            if k in params1:
+                v1 = params1[k]
+                if len(v1) == 0:
+                    params1[k] = v2
+                elif len(v2) > 0:
+                    if str in [type(v1), type(v2)]:
+                        raise pywikibot.Error(
+                            "Both merged values must be of type 'str'")
+                    params1[k] = v1 + '|' + v2
+                # else ignore
+            else:
+                params1[k] = v2
+        return params1
+
+    def ConvToList(self, item):
+        """Ensure the output is a list."""
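+        # Illustrative usage (hypothetical values):
+        #   ConvToList(None)       -> []
+        #   ConvToList(u'Foo')     -> [u'Foo']
+        #   ConvToList([u'Foo'])   -> [u'Foo']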
+        if item is None:
+            return []
+        elif isinstance(item, basestring):
+            return [item]
+        else:
+            return item
+
+    def ListToParam(self, list):
+        """Convert a list of unicode strings into a string separated by the
+        '|' symbol.
+        """
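+        # Illustrative usage (hypothetical values):
+        #   ListToParam([])                    -> ''
+        #   ListToParam([u'Foo', u'Bar baz'])  -> u'Foo|Bar baz'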
+        list = self.ConvToList(list)
+        if len(list) == 0:
+            return ''
+
+        encList = ''
+        # items must not contain the '|' symbol
+        for item in list:
+            if isinstance(item, basestring):
+                if u'|' in item:
+                    raise pywikibot.Error(u"item '%s' contains '|' symbol" % item)
+                encList += self.ToUtf8(item) + u'|'
+            elif type(item) == int:
+                encList += self.ToUtf8(item) + u'|'
+            elif isinstance(item, pywikibot.Page):
+                encList += self.ToUtf8(item.title()) + u'|'
+            elif item.__class__.__name__ == 'User':
+                # delay loading this until it is needed
+                encList += self.ToUtf8(item.name()) + u'|'
+            else:
+                raise pywikibot.Error(u'unknown item class %s'
+                                      % item.__class__.__name__)
+
+        # strip trailing '|' before returning
+        return encList[:-1]
+
+    def ToUtf8(self, s):
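+        """Return s as a unicode string.
+
+        Despite the name, byte strings are decoded (falling back to
+        pywikibot.config.console_encoding) rather than encoded to UTF-8.
+        """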
+        if type(s) != unicode:
+            try:
+                s = unicode(s)
+            except UnicodeDecodeError:
+                s = s.decode(pywikibot.config.console_encoding)
+        return s
+
+
+def main(*args):
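+    """Process command line arguments and run CopyrightCleanBot."""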
+    # Process global args and prepare generator args parser
+    local_args = pywikibot.handle_args(args)
+    genFactory = pagegenerators.GeneratorFactory()
+
+    for arg in local_args:
+        genFactory.handleArg(arg)
+    gen = genFactory.getCombinedGenerator()
+
+    if not gen:
+        cat = pywikibot.Category(pywikibot.Site(), 'Category:%s' %
+                                 pywikibot.translate(pywikibot.Site(),
+                                                     reports_cat))
+        gen = pagegenerators.CategorizedPageGenerator(cat, recurse=True)
+    bot = CopyrightCleanBot(gen)
+    bot.run()
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/copyright/copyright_put.py b/scripts/copyright/copyright_put.py
new file mode 100644
index 0000000..7b692bd
--- /dev/null
+++ b/scripts/copyright/copyright_put.py
@@ -0,0 +1,297 @@
+# -*- coding: utf-8  -*-
+"""
+docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
+
+"""
+__version__ = '$Id$'
+
+#
+# (C) Francesco Cosoleto, 2006
+#
+# Distributed under the terms of the MIT license.
+#
+
+import re
+import codecs
+import os
+import time
+import shutil
+
+import pywikibot
+from pywikibot import config, date
+from pywikibot import pagegenerators
+
+from copyright import put, join_family_data, appdir, reports_cat
+
+#
+# Month + Year save method (e.g. User:BotName/Report_December_2007)
+append_date_to_wiki_save_path = True
+
+#
+# Append day of month to wiki save path
+# (e.g. User:BotName/Report_25_December_2007)
+append_day_to_wiki_save_path = False
+
+#
+# Add publication date to entries (template:botdate)
+append_date_to_entries = False
+
+msg_table = {
+    'ar': {'_default': [u'مدخلات جديدة', u'مدخلات جديدة']},
+    'en': {'_default': [u'New entries', u'New entries']},
+    'es': {'_default': [u'Entradas nuevas', u'Entradas nuevas']},
+    'fa': {'_default': [u'محتویات جدید', u'محتویات جدید']},
+    'it': {'_default': [u'Pagine nuove', u'Nuove voci'],
+           'feed': [u'Aggiunte a voci esistenti', u'Testo aggiunto in']},
+    'ru': {'_default': [u'Новые записи', u'Новые записи']},
+    'uk': {'_default': [u'Нові записи', u'Нові записи']},
+}
+
+template_cat = {
+    '_default': [u'This template is used by copyright.py, a script part of [[:m:Using the python wikipediabot|PyWikipediaBot]].',
+                 u''],
+    'it': [u'Questo template è usato dallo script copyright.py del [[:m:Using the python wikipediabot|PyWikipediaBot]].',
+           u'Template usati da bot'],
+}
+
+stat_msg = {
+    'ar': [u'إحصاءات', u'صفحة', u'مدخلات', u'حجم', u'إجمالي', 'تحديث'],
+    'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'],
+    'es': [u'Estadísticas', u'Página', u'Entradas', u'Tamaño', u'Total',
+           u'Actualizacion'],
+    'fa': [u'آمار', u'صفحه', u'محتویات', u'اندازه', u'مجموع', 'بروزرسانی'],
+    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale',
+           u'Ultimo aggiornamento'],
+    'ru': [u'Статистика', u'Страница', u'Записи', u'Размер', u'Всего',
+           u'Изменено'],
+    'uk': [u'Статистика', u'Сторінка', u'Записи', u'Розмір', u'Разом',
+           u'Змінено'],
+}
+
+separatorC = re.compile('(?m)^== +')
+
+
+def get_wiki_save_page(stat_page=False):
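+    """Return the page used to save reports (or statistics if stat_page)."""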
+
+    site = pywikibot.Site()
+    wiki_save_path = {
+        '_default': u'User:%s/Report' % config.usernames[
+            site.family.name][site.code],
+        'es': u'Usuario:%s/Reporte' % config.usernames[
+            site.family.name][site.code],
+        'it': u'Utente:RevertBot/Report',
+    }
+
+    save_path = pywikibot.translate(site, wiki_save_path, fallback=True)
+    if stat_page:
+        return pywikibot.Page(site,
+                              '%s/%s' % (save_path,
+                                         pywikibot.translate(site,
+                                                             stat_msg)[0]))
+    if append_date_to_wiki_save_path:
+        t = time.localtime()
+        day = ''
+        if append_day_to_wiki_save_path:
+            day = '_' + str(t[2])
+        save_path += '%s_%s_%s' % (day, date.monthName(site.code, t[1]),
+                                   str(t[0]))
+    return pywikibot.Page(site, save_path)
+
+
+def set_template(name=None):
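+    """Create Template:botdate or Template:botbox on the wiki if missing."""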
+    site = pywikibot.Site()
+    tcat = pywikibot.translate(site, template_cat)
+    url = "%s://%s%s" % (site.protocol(), site.hostname(), site.path())
+    botdate = u"""
+<div style="text-align:right">{{{1}}}</div><noinclude>%s\n[[%s:%s]]</noinclude>
+""" % (tcat[0], site.namespace(14), tcat[1])
+
+    botbox = """
+<div class=plainlinks style="text-align:right">[%s?title={{{1}}}&diff={{{2}}}&oldid={{{3}}} diff] - [%s?title={{{1}}}&action=history cron] - [%s?title=Special:Log&page={{{1}}} log]</div><noinclude>%s\n[[%s:%s]]</noinclude>
+""" % (url, url, url, tcat[0], site.namespace(14), tcat[1])
+
+    if name == 'botdate':
+        p = pywikibot.Page(site, 'Template:botdate')
+        if not p.exists():
+            p.put(botdate, comment='Init.')
+    if name == 'botbox':
+        p = pywikibot.Page(site, 'Template:botbox')
+        if not p.exists():
+            p.put(botbox, comment='Init.')
+
+
+def stat_sum(engine, text):
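+    """Return the number of report lines that mention the given engine.
+
+    Entries are expected to be bullet lines of the form
+    '* ... <engine> ... - ...' (the exact layout comes from copyright.py).
+    """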
+    return len(re.findall('(?im)^\*.*?' + engine + '.*?- ', text))
+
+
+def get_stats():
+    msg = pywikibot.translate(pywikibot.Site(), stat_msg)
+    cat = pywikibot.Category(pywikibot.Site(),
+                             'Category:%s'
+                             % pywikibot.translate(pywikibot.Site(),
+                                                   reports_cat))
+    gen = pagegenerators.CategorizedPageGenerator(cat, recurse=True)
+    output = u"""{| {{prettytable|width=|align=|text-align=left}}
+! %s
+! %s
+! %s
+! %s
+! %s
+! %s
+|-
+""" % (msg[1], msg[2], msg[3], 'Google', 'Yahoo', 'Live Search')
+    gnt, ynt, mnt, ent, sn, snt = 0, 0, 0, 0, 0, 0
+    for page in gen:
+        data = page.get()
+        gn = stat_sum('google', data)
+        yn = stat_sum('yahoo', data)
+        mn = stat_sum('(msn|live)', data)
+        en = len(re.findall('=== \[\[', data))
+        sn = len(data)
+        gnt += gn
+        ynt += yn
+        mnt += mn
+        ent += en
+        snt += sn
+        if en > 0:
+            output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" \
+                      % (page.title(asLink=True), en, sn / 1024, gn, yn, mn)
+    output += u"""|&nbsp;||||||||
+|-
+|'''%s'''||%s||%s KB||%s||%s||%s
+|-
+|colspan="6" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small>
+|}
+""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5],
+       time.strftime("%d " + "%s"
+                     % (date.monthName(pywikibot.Site().language(),
+                                           time.localtime()[1])) + " %Y"))
+    return output
+
+
+def put_stats():
+    page = get_wiki_save_page(stat_page=True)
+    page.put(get_stats(), comment=pywikibot.translate(pywikibot.Site(),
+                                                      stat_msg)[0])
+
+
+def output_files_gen():
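+    """Yield (path, section title, summary) for each output file in appdir.
+
+    Files named 'output_<tag>.txt' select the per-tag messages from
+    msg_table; any other output file falls back to the '_default' entry.
+    """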
+    for f in os.listdir(appdir):
+        if 'output' in f and '_pending' not in f:
+            m = re.search('output_(.*?)\.txt', f)
+            if m:
+                tag = m.group(1)
+            else:
+                tag = '_default'
+            section_name_and_summary = pywikibot.translate(pywikibot.Site(),
+                                                           msg_table)[tag]
+            section = section_name_and_summary[0]
+            summary = section_name_and_summary[1]
+            yield os.path.join(appdir, f), section, summary
+
+
+def read_output_file(filename):
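+    """Move filename to filename + '_pending' and return its content.
+
+    If a '_pending' file is already present (e.g. from an earlier run whose
+    upload did not complete), the new output is appended to it first so no
+    entries are lost.
+    """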
+    if os.path.isfile(filename + '_pending'):
+        shutil.move(filename, filename + '_temp')
+        ap = codecs.open(filename + '_pending', 'a', 'utf-8')
+        ot = codecs.open(filename + '_temp', 'r', 'utf-8')
+        ap.write(ot.read())
+        ap.close()
+        ot.close()
+        os.remove(filename + '_temp')
+    else:
+        shutil.move(filename, filename + '_pending')
+    f = codecs.open(filename + '_pending', 'r', 'utf-8')
+    data = f.read()
+    f.close()
+    return data
+
+
+def run(send_stats=False):
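+    """Upload pending report output to the wiki save page.
+
+    New entries are inserted under the matching '== section ==' heading, or
+    appended at the end of the page if the heading is missing; with
+    send_stats=True the statistics page is updated as well.
+    """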
+    page = get_wiki_save_page()
+    try:
+        wikitext = page.get()
+    except pywikibot.NoPage:
+        pywikibot.output("%s not found." % page.title(asLink=True))
+        wikitext = '[[%s:%s]]\n' % (pywikibot.Site().namespace(14),
+                                    pywikibot.translate(pywikibot.Site(),
+                                                        reports_cat))
+    final_summary = u''
+    output_files = list()
+    for f, section, summary in output_files_gen():
+        pywikibot.output('File: \'%s\'\nSection: %s\n' % (f, section))
+        output_data = read_output_file(f)
+        output_files.append(f)
+        entries = re.findall('=== (.*?) ===', output_data)
+        if entries:
+            if append_date_to_entries:
+                dt = time.strftime('%d-%m-%Y %H:%M', time.localtime())
+                output_data = re.sub("(?m)^(=== \[\[.*?\]\] ===\n)",
+                                     r"\1{{botdate|%s}}\n" % dt, output_data)
+            m = re.search('(?m)^==\s*%s\s*==' % section, wikitext)
+            if m:
+                m_end = re.search(separatorC, wikitext[m.end():])
+                if m_end:
+                    wikitext = (wikitext[:m_end.start() + m.end()] +
+                                output_data +
+                                wikitext[m_end.start() + m.end():])
+                else:
+                    wikitext += '\n' + output_data
+            else:
+                wikitext += '\n' + output_data
+            if final_summary:
+                final_summary += ' '
+            final_summary += u'%s: %s' % (summary, ', '.join(entries))
+
+    if final_summary:
+        pywikibot.output(final_summary + '\n')
+
+        # if a page in 'Image' or 'Category' namespace is checked then fix
+        # title section by adding ':' in order to avoid wiki code effects.
+        wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Image', 6),
+                          ur'=== [[:\1:', wikitext)
+        wikitext = re.sub(u'(?i)=== \[\[%s:'
+                          % join_family_data('Category', 14),
+                          ur'=== [[:\1:', wikitext)
+
+        # TODO:
+# List of frequently rejected addresses, to improve the upload process.
+        wikitext = re.sub('http://(.*?)((forumcommunity|forumfree).net)',
+                          r'<blacklist>\1\2', wikitext)
+
+        if len(final_summary) >= 200:
+            final_summary = final_summary[:200]
+            final_summary = final_summary[
+                :final_summary.rindex("[") - 3] + "..."
+
+        try:
+            put(page, wikitext, comment=final_summary)
+            for f in output_files:
+                os.remove(f + '_pending')
+                pywikibot.output("\'%s\' deleted." % f)
+        except pywikibot.PageNotSaved:
+            raise
+
+        if append_date_to_entries:
+            set_template(name='botdate')
+        if '{{botbox' in wikitext:
+            set_template(name='botbox')
+
+    if send_stats:
+        put_stats()
+
+
+def main(*args):
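+    """Process command line arguments and upload the pending reports."""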
+    # Send statistics
+    send_stats = False
+    genFactory = pagegenerators.GeneratorFactory()
+    for arg in pywikibot.handle_args(args):
+        if arg == "-stats":
+            send_stats = True
+        else:
+            genFactory.handleArg(arg)
+    gen = genFactory.getCombinedGenerator()
+    run(send_stats=send_stats)
+
+
+if __name__ == "__main__":
+    main()

-- 
To view, visit https://gerrit.wikimedia.org/r/194824
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia0c3a9fe6a2c3be3cdbad517ac9dbf3249c197ab
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Prianka <priyankajayaswal...@gmail.com>
