bin/attachment_mimetypes.py              |  157 ++++++++++++++++++++++++++++++
 bin/get-bugzilla-attachments-by-mimetype |  158 -------------------------------
 bin/get-forum-attachments.py             |  106 ++++++++++++++++++++
 3 files changed, 264 insertions(+), 157 deletions(-)

New commits:
commit 8b8e9d3126d4232c6c13e6059ab3542a521251d8
Author:     Xisco Fauli <xiscofa...@libreoffice.org>
AuthorDate: Mon Jun 6 17:28:57 2022 +0200
Commit:     Xisco Fauli <xiscofa...@libreoffice.org>
CommitDate: Tue Jun 7 20:26:13 2022 +0200

    bin: Add script to get attachments from OO forums
    
    Testing it locally, I could download 52.000 documents
    Reuse mimetypes dictionary from get-bugzilla-attachments-by-mimetype
    by putting it into an external file
    
    Change-Id: I875d90f6119c3c3bdfea6a0efd3bbc8c5be1eb63
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135457
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli <xiscofa...@libreoffice.org>

diff --git a/bin/attachment_mimetypes.py b/bin/attachment_mimetypes.py
new file mode 100644
index 000000000000..ede5fcb39fb9
--- /dev/null
+++ b/bin/attachment_mimetypes.py
@@ -0,0 +1,157 @@
+mimetypes = {
+# ODF
+    'application/vnd.oasis.opendocument.base': 'odb',
+    'application/vnd.oasis.opendocument.database': 'odb',
+    'application/vnd.oasis.opendocument.chart': 'odc',
+    'application/vnd.oasis.opendocument.chart-template': 'otc',
+    'application/vnd.oasis.opendocument.formula': 'odf',
+    'application/vnd.oasis.opendocument.formula-template': 'otf',
+    'application/vnd.oasis.opendocument.graphics': 'odg',
+    'application/vnd.oasis.opendocument.graphics-template': 'otg',
+    'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
+    'application/vnd.oasis.opendocument.presentation': 'odp',
+    'application/vnd.oasis.opendocument.presentation-template': 'otp',
+    'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
+    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
+    'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
+    'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
+    'application/vnd.oasis.opendocument.text': 'odt',
+    'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
+    'application/vnd.oasis.opendocument.text-master': 'odm',
+    'application/vnd.oasis.opendocument.text-template': 'ott',
+    'application/vnd.oasis.opendocument.text-master-template': 'otm',
+    'application/vnd.oasis.opendocument.text-web': 'oth',
+# OOo XML
+    'application/vnd.sun.xml.base': 'odb',
+    'application/vnd.sun.xml.calc': 'sxc',
+    'application/vnd.sun.xml.calc.template': 'stc',
+    'application/vnd.sun.xml.chart': 'sxs',
+    'application/vnd.sun.xml.draw': 'sxd',
+    'application/vnd.sun.xml.draw.template': 'std',
+    'application/vnd.sun.xml.impress': 'sxi',
+    'application/vnd.sun.xml.impress.template': 'sti',
+    'application/vnd.sun.xml.math': 'sxm',
+    'application/vnd.sun.xml.writer': 'sxw',
+    'application/vnd.sun.xml.writer.global': 'sxg',
+    'application/vnd.sun.xml.writer.template': 'stw',
+    'application/vnd.sun.xml.writer.web': 'stw',
+# MSO
+    'application/rtf': 'rtf',
+    'text/rtf': 'rtf',
+    'application/msword': 'doc',
+    'application/vnd.ms-powerpoint': 'ppt',
+    'application/vnd.ms-excel': 'xls',
+    'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
+    'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
+    'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
+    'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
+    'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
+    'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
+    'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
+    'application/vnd.ms-word.document.macroEnabled.12': 'docm',
+    'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
+    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
+    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+    'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
+    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
+    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
+    'application/vnd.visio': 'vsd',
+    'application/visio.drawing': 'vsd',
+    'application/vnd.visio2013': 'vsdx',
+    'application/vnd.visio.xml': 'vdx',
+    'application/x-mspublisher': 'pub',
+#WPS Office
+    'application/wps-office.doc': 'doc',
+    'application/wps-office.docx': 'docx',
+    'application/wps-office.xls': 'xls',
+    'application/wps-office.xlsx': 'xlsx',
+    'application/wps-office.ppt': 'ppt',
+    'application/wps-office.pptx': 'pptx',
+# W3C
+    'application/xhtml+xml': 'xhtml',
+    'application/mathml+xml': 'mml',
+    'text/html': 'html',
+    'application/docbook+xml': 'docbook',
+# misc
+    'text/csv': 'csv',
+    'text/spreadsheet': 'slk',
+    'application/x-qpro': 'qpro',
+    'application/x-dbase': 'dbf',
+    'application/vnd.corel-draw': 'cdr',
+    'application/vnd.lotus-wordpro': 'lwp',
+    'application/vnd.lotus-1-2-3': 'wks',
+    'application/vnd.wordperfect': 'wpd',
+    'application/wordperfect5.1': 'wpd',
+    'application/vnd.ms-works': 'wps',
+    'application/clarisworks' : 'cwk',
+    'application/macwriteii' : 'mw',
+    'application/vnd.apple.keynote': 'key',
+    'application/vnd.apple.numbers': 'numbers',
+    'application/vnd.apple.pages': 'pages',
+    'application/x-iwork-keynote-sffkey': 'key',
+    'application/x-iwork-numbers-sffnumbers': 'numbers',
+    'application/x-iwork-pages-sffpages': 'pages',
+    'application/x-hwp': 'hwp',
+    'application/x-aportisdoc': 'pdb',
+    'application/prs.plucker' : 'pdb_plucker',
+    'application/vnd.palm' : 'pdb_palm',
+    'application/x-sony-bbeb' : 'lrf',
+    'application/x-pocket-word': 'psw',
+    'application/x-t602': '602',
+    'application/x-fictionbook+xml': 'fb2',
+    'application/x-abiword': 'abw',
+    'application/x-pagemaker': 'pmd',
+    'application/x-gnumeric': 'gnumeric',
+    'application/vnd.stardivision.calc': 'sdc',
+    'application/vnd.stardivision.draw': 'sda',
+    'application/vnd.stardivision.writer': 'sdw',
+    'application/x-starcalc': 'sdc',
+    'application/x-stardraw': 'sdd',
+    'application/x-starwriter': 'sdw',
+# relatively uncommon image mimetypes
+    'image/x-freehand': 'fh',
+    'image/cgm': 'cgm',
+    'image/tif': 'tiff',
+    'image/tiff': 'tiff',
+    'image/vnd.dxf': 'dxf',
+    'image/emf': 'emf',
+    'image/x-emf': 'emf',
+    'image/x-targa': 'tga',
+    'image/x-sgf': 'sgf',
+    'image/x-svm': 'svm',
+    'image/wmf': 'wmf',
+    'image/x-wmf': 'wmf',
+    'image/x-pict': 'pict',
+    'image/x-cmx': 'cmx',
+    'image/svg+xml': 'svg',
+    'image/bmp': 'bmp',
+    'image/x-ms-bmp': 'bmp',
+    'image/x-MS-bmp': 'bmp',
+    'image/x-wpg': 'wpg',
+    'image/x-eps': 'eps',
+    'image/x-met': 'met',
+    'image/x-portable-bitmap': 'pbm',
+    'image/x-photo-cd': 'pcd',
+    'image/x-pcx': 'pcx',
+    'image/x-portable-graymap': 'pgm',
+    'image/x-portable-pixmap': 'ppm',
+    'image/vnd.adobe.photoshop': 'psd',
+    'image/x-cmu-raster': 'ras',
+    'image/x-sun-raster': 'ras',
+    'image/x-xbitmap': 'xbm',
+    'image/x-xpixmap': 'xpm',
+}
+
+# disabled for now, this would download gigs of pngs/jpegs...
+common_noncore_mimetypes = {
+# graphics
+    'image/gif': 'gif',
+    'image/jpeg': 'jpeg',
+    'image/png': 'png',
+# pdf, etc.
+    'application/pdf': 'pdf',
+}
+
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index a38b6ea95bca..609e6683a0aa 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -46,6 +46,7 @@ except:
     import xmlrpclib
 from xml.dom import minidom
 from xml.sax.saxutils import escape
+from attachment_mimetypes import mimetypes
 
 def urlopen_retry(url):
     maxretries = 3
@@ -370,163 +371,6 @@ redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
 #system is a nightmare
 novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
 
-mimetypes = {
-# ODF
-    'application/vnd.oasis.opendocument.base': 'odb',
-    'application/vnd.oasis.opendocument.database': 'odb',
-    'application/vnd.oasis.opendocument.chart': 'odc',
-    'application/vnd.oasis.opendocument.chart-template': 'otc',
-    'application/vnd.oasis.opendocument.formula': 'odf',
-    'application/vnd.oasis.opendocument.formula-template': 'otf',
-    'application/vnd.oasis.opendocument.graphics': 'odg',
-    'application/vnd.oasis.opendocument.graphics-template': 'otg',
-    'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
-    'application/vnd.oasis.opendocument.presentation': 'odp',
-    'application/vnd.oasis.opendocument.presentation-template': 'otp',
-    'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
-    'application/vnd.oasis.opendocument.spreadsheet': 'ods',
-    'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
-    'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
-    'application/vnd.oasis.opendocument.text': 'odt',
-    'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
-    'application/vnd.oasis.opendocument.text-master': 'odm',
-    'application/vnd.oasis.opendocument.text-template': 'ott',
-    'application/vnd.oasis.opendocument.text-master-template': 'otm',
-    'application/vnd.oasis.opendocument.text-web': 'oth',
-# OOo XML
-    'application/vnd.sun.xml.base': 'odb',
-    'application/vnd.sun.xml.calc': 'sxc',
-    'application/vnd.sun.xml.calc.template': 'stc',
-    'application/vnd.sun.xml.chart': 'sxs',
-    'application/vnd.sun.xml.draw': 'sxd',
-    'application/vnd.sun.xml.draw.template': 'std',
-    'application/vnd.sun.xml.impress': 'sxi',
-    'application/vnd.sun.xml.impress.template': 'sti',
-    'application/vnd.sun.xml.math': 'sxm',
-    'application/vnd.sun.xml.writer': 'sxw',
-    'application/vnd.sun.xml.writer.global': 'sxg',
-    'application/vnd.sun.xml.writer.template': 'stw',
-    'application/vnd.sun.xml.writer.web': 'stw',
-# MSO
-    'application/rtf': 'rtf',
-    'text/rtf': 'rtf',
-    'application/msword': 'doc',
-    'application/vnd.ms-powerpoint': 'ppt',
-    'application/vnd.ms-excel': 'xls',
-    'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
-    'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
-    'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
-    'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
-    'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
-    'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
-    'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
-    'application/vnd.ms-word.document.macroEnabled.12': 'docm',
-    'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
-    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
-    'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
-    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
-    'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
-    'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
-    'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
-    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
-    'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
-    'application/vnd.visio': 'vsd',
-    'application/visio.drawing': 'vsd',
-    'application/vnd.visio2013': 'vsdx',
-    'application/vnd.visio.xml': 'vdx',
-    'application/x-mspublisher': 'pub',
-#WPS Office
-    'application/wps-office.doc': 'doc',
-    'application/wps-office.docx': 'docx',
-    'application/wps-office.xls': 'xls',
-    'application/wps-office.xlsx': 'xlsx',
-    'application/wps-office.ppt': 'ppt',
-    'application/wps-office.pptx': 'pptx',
-# W3C
-    'application/xhtml+xml': 'xhtml',
-    'application/mathml+xml': 'mml',
-    'text/html': 'html',
-    'application/docbook+xml': 'docbook',
-# misc
-    'text/csv': 'csv',
-    'text/spreadsheet': 'slk',
-    'application/x-qpro': 'qpro',
-    'application/x-dbase': 'dbf',
-    'application/vnd.corel-draw': 'cdr',
-    'application/vnd.lotus-wordpro': 'lwp',
-    'application/vnd.lotus-1-2-3': 'wks',
-    'application/vnd.wordperfect': 'wpd',
-    'application/wordperfect5.1': 'wpd',
-    'application/vnd.ms-works': 'wps',
-    'application/clarisworks' : 'cwk',
-    'application/macwriteii' : 'mw',
-    'application/vnd.apple.keynote': 'key',
-    'application/vnd.apple.numbers': 'numbers',
-    'application/vnd.apple.pages': 'pages',
-    'application/x-iwork-keynote-sffkey': 'key',
-    'application/x-iwork-numbers-sffnumbers': 'numbers',
-    'application/x-iwork-pages-sffpages': 'pages',
-    'application/x-hwp': 'hwp',
-    'application/x-aportisdoc': 'pdb',
-    'application/prs.plucker' : 'pdb_plucker',
-    'application/vnd.palm' : 'pdb_palm',
-    'application/x-sony-bbeb' : 'lrf',
-    'application/x-pocket-word': 'psw',
-    'application/x-t602': '602',
-    'application/x-fictionbook+xml': 'fb2',
-    'application/x-abiword': 'abw',
-    'application/x-pagemaker': 'pmd',
-    'application/x-gnumeric': 'gnumeric',
-    'application/vnd.stardivision.calc': 'sdc',
-    'application/vnd.stardivision.draw': 'sda',
-    'application/vnd.stardivision.writer': 'sdw',
-    'application/x-starcalc': 'sdc',
-    'application/x-stardraw': 'sdd',
-    'application/x-starwriter': 'sdw',
-# relatively uncommon image mimetypes
-    'image/x-freehand': 'fh',
-    'image/cgm': 'cgm',
-    'image/tif': 'tiff',
-    'image/tiff': 'tiff',
-    'image/vnd.dxf': 'dxf',
-    'image/emf': 'emf',
-    'image/x-emf': 'emf',
-    'image/x-targa': 'tga',
-    'image/x-sgf': 'sgf',
-    'image/x-svm': 'svm',
-    'image/wmf': 'wmf',
-    'image/x-wmf': 'wmf',
-    'image/x-pict': 'pict',
-    'image/x-cmx': 'cmx',
-    'image/svg+xml': 'svg',
-    'image/bmp': 'bmp',
-    'image/x-ms-bmp': 'bmp',
-    'image/x-MS-bmp': 'bmp',
-    'image/x-wpg': 'wpg',
-    'image/x-eps': 'eps',
-    'image/x-met': 'met',
-    'image/x-portable-bitmap': 'pbm',
-    'image/x-photo-cd': 'pcd',
-    'image/x-pcx': 'pcx',
-    'image/x-portable-graymap': 'pgm',
-    'image/x-portable-pixmap': 'ppm',
-    'image/vnd.adobe.photoshop': 'psd',
-    'image/x-cmu-raster': 'ras',
-    'image/x-sun-raster': 'ras',
-    'image/x-xbitmap': 'xbm',
-    'image/x-xpixmap': 'xpm',
-}
-
-# disabled for now, this would download gigs of pngs/jpegs...
-common_noncore_mimetypes = {
-# graphics
-    'image/gif': 'gif',
-    'image/jpeg': 'jpeg',
-    'image/png': 'png',
-# pdf, etc.
-    'application/pdf': 'pdf',
-}
-
 class manage_threads(threading.Thread):
     def run(self):
         #print(threading.current_thread().get_ident())
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py
new file mode 100755
index 000000000000..9b967d5a4963
--- /dev/null
+++ b/bin/get-forum-attachments.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import magic
+import tempfile
+import os
+import shutil
+from attachment_mimetypes import mimetypes
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
+languages = {
+    'en': "https://forum.openoffice.org/en/forum",
+    'es': "https://forum.openoffice.org/es/forum",
+    'fr': "https://forum.openoffice.org/fr/forum",
+    'hu': "https://forum.openoffice.org/hu/forum",
+    'it': "https://forum.openoffice.org/it/forum",
+    'ja': "https://forum.openoffice.org/ja/forum",
+    'nl': "https://forum.openoffice.org/nl/forum",
+    'pl': "https://forum.openoffice.org/pl/forum",
+    'vi': "https://forum.openoffice.org/vi/forum",
+    'tr': "https://forum.libreoffice.org.tr",
+    'de': "https://www.openoffice-forum.de",
+    'de2': "https://www.libreoffice-forum.de",
+    'de3': "https://de.openoffice.info",
+}
+
+def get_attachments_from_url(lang, url):
+
+    print("Checking " + url)
+
+    startPoint = 0
+
+    # Keep the index and resume from there
+    indexFile = lang + ".index"
+    if os.path.isfile(indexFile):
+        with open(indexFile) as f:
+            startPoint = int(f.readline().rstrip()) + 1
+    else:
+        if lang == 'hu':
+            startPoint = 1300
+
+    session = requests.Session()
+    retry = Retry(connect=3, backoff_factor=0.5)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+
+    invalidCount = 0
+    for i in range(startPoint, 999999):
+        fileUrl = url + "/download/file.php?id=" + str(i)
+        h = session.head(fileUrl)
+        header = h.headers
+        content_type = header.get('content-type')
+        if "html" in content_type:
+            # Let's assume this is an invalid file link
+            invalidCount += 1
+
+            # Let's assume, if we get 100 invalid files, that there are no more files
+            if invalidCount == 100:
+                print("No more attachments found in " + url)
+                break
+        else:
+            invalidCount = 0
+
+            if content_type == 'application/octet-stream':
+                r = session.get(fileUrl, allow_redirects=True)
+                with tempfile.NamedTemporaryFile() as tmp:
+                    tmp.write(r.content)
+                    mimetype = magic.from_file(tmp.name, mime=True)
+                    if mimetype in mimetypes:
+                        suffix = mimetypes[mimetype]
+                        try:
+                            os.mkdir(suffix)
+                        except:
+                            pass
+
+                        download = suffix + '/' + "forum-" + lang + '-' + str(i) + '.' + suffix
+
+                        print("Downloading as " + download)
+                        shutil.copy(tmp.name, download)
+
+            # Save the index
+            with open(indexFile, 'w') as f:
+                f.write(str(i))
+
+if __name__ == '__main__':
+
+    processes = []
+    # 10 at a time seems to work fine
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        for lang, url in languages.items():
+            processes.append(executor.submit(get_attachments_from_url, lang, url))
+
+    for task in as_completed(processes):
+        result = task.result()
+        if result:
+            print(result)

Reply via email to