core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

New commits:
commit 8ca321f9e52a43acf36a8f8184f81240bd946653
Author:     Xisco Fauli
AuthorDate: Thu Mar 14 22:49:19 2024 +0100
Commit:     Xisco Fauli
CommitDate: Fri Mar 15 12:58:57 2024 +0100

    crashreportScraper: fix version comparison

    And remove version column, it's not very relevant anyway

    Change-Id: I9101d5f63aec237cdcbfc6eb3759714cca7c5328
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/164849
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 78b2f6cb5eb2..6ce91bcba189 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -18,6 +18,7 @@ import os
 import math
 from datetime import datetime
 import urllib.parse
+import re
 
 def convert_str_to_date(value):
     value = value.replace('.', '')
@@ -73,7 +74,7 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
     reports = soup.find("div", {"id": "reports"}).tbody
 
     ID, currentID = "", ""
-    version, currentVersion = "", ""
+    version, currentVersion = 0, 0
     OS, currentOS = "", ""
 
     tr_list = reports.find_all("tr")
@@ -81,7 +82,7 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
         td_list = tr.find_all("td")
 
         currentID = td_list[0].a.text.strip()
-        currentVersion = td_list[2].text.strip().split(': ')[1]
+        currentVersion = int(''.join(re.findall("\d+", td_list[2].text)))
         currentOS = td_list[3].text.strip()
 
         # get most recent version
@@ -91,16 +92,13 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
             ID = currentID
             OS = currentOS
 
-    if not version:
-        version = currentVersion
-
     if not ID:
         ID = currentID
 
     if not OS:
         OS = currentOS
 
-    return count, ID, version, OS
+    return count, ID, OS
 
 def parse_details_and_get_info(url, gitRepo):
     try:
@@ -187,7 +185,7 @@ if __name__ == '__main__':
 
     with open(fileName, "a") as f:
         if bInsertHeader:
             line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
-                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
+                "ID", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
             f.write(line)
             f.flush()
@@ -195,13 +193,13 @@ if __name__ == '__main__':
         if k not in crashesInFile:
             print("Parsing " + k)
             try:
-                crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
+                crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                         "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                 crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                         "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                 ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                 line = '\t'.join([k, str(ratio), str(crashCount) , lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
-                    crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
+                    crashID, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                 f.write(line)
                 f.flush()
             except (requests.exceptions.Timeout, AttributeError):
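
Side note on the fix: joining every digit group into one integer gives a value that orders versions numerically, where the old string comparison sorted "7.6.4.1" above "7.6.10.3". A minimal sketch, with hypothetical cell text standing in for td_list[2]:

    import re

    def version_key(cell_text):
        # Concatenate all digit groups and compare as an integer, as the commit does.
        return int(''.join(re.findall(r"\d+", cell_text)))

    print(version_key("Version: 7.6.4.1"))   # 7641
    print(version_key("Version: 7.6.10.3"))  # 76103
    # As strings, "7.6.4.1" > "7.6.10.3"; as integers the newer build wins.
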
core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 13 -
 1 file changed, 13 deletions(-)

New commits:
commit 40bab7e27f13becb45055c9cfcd52aebf6128536
Author:     Xisco Fauli
AuthorDate: Mon Feb 12 11:11:17 2024 +0100
Commit:     Xisco Fauli
CommitDate: Tue Feb 13 10:33:42 2024 +0100

    crashreportScraper: no need to go to the last page anymore

    JJ changed the way the reports are sorted to show the most recent ones on page 1

    Change-Id: I59d566ff45fd8a75263b70a1e436e4263422e93b
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/163239
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 876570d3a028..78b2f6cb5eb2 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -71,19 +71,6 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
         td_list = tr.find_all("td")
         count += int(td_list[1].text.strip())
 
-    # There are 50 reports on each page.
-    # Go to the last page based on the total count to get a recent report
-    last_page = math.ceil( count / 50 )
-
-    if last_page > 1:
-        url = url + "?page=" + str(last_page)
-        try:
-            html_text = requests.get(url, timeout=200).text
-            soup = BeautifulSoup(html_text, 'html.parser')
-        except requests.exceptions.Timeout:
-            print("Timeout")
-            raise
-
     reports = soup.find("div", {"id": "reports"}).tbody
     ID, currentID = "", ""
     version, currentVersion = "", ""
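
For context, the per-OS count summation that stays behind after the pagination code is removed, as a self-contained sketch; the markup below is hypothetical but shaped like the os_tab table the script reads:

    from bs4 import BeautifulSoup

    # Hypothetical markup shaped like the "os_tab" table on a signature page.
    html = """
    <table id="os_tab"><tbody>
      <tr><td>windows</td><td>41</td></tr>
      <tr><td>linux</td><td>9</td></tr>
    </tbody></table>
    """

    soup = BeautifulSoup(html, "html.parser")
    count = 0
    for tr in soup.find("table", {"id": "os_tab"}).tbody.find_all("tr"):
        count += int(tr.find_all("td")[1].text.strip())
    print(count)  # 50
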
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

New commits:
commit 55eef24a55afb3708f2e02abf65b6934bed7f3de
Author:     Xisco Fauli
AuthorDate: Tue Sep 5 12:23:20 2023 +0200
Commit:     Xisco Fauli
CommitDate: Tue Sep 5 16:53:44 2023 +0200

    crashreportScraper: Add ratio column

    it makes sense to sort the sheet by this column

    Change-Id: I05603dac80289605c18e86fbf27c3d899f9862c2
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/156562
    Tested-by: Xisco Fauli
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 54477f6c4c28..876570d3a028 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -28,9 +28,7 @@ def convert_str_to_date(value):
     value = value.replace('Sept', 'Sep')
     # reset the time leaving the date
     value = ", ".join(value.split(", ")[:-1])
-    dtDate = datetime.strptime(value, '%b %d, %Y')
-
-    return dtDate.strftime('%y/%m/%d')
+    return datetime.strptime(value, '%b %d, %Y')
 
 def parse_version_url(url):
     crashReports = {}
@@ -201,7 +199,7 @@ if __name__ == '__main__':
 
     with open(fileName, "a") as f:
         if bInsertHeader:
-            line = '\t'.join(["Name", "Count", "First report", "Last Report",
+            line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                 "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
             f.write(line)
             f.flush()
@@ -214,7 +212,8 @@ if __name__ == '__main__':
                         "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                 crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                         "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
-                line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
+                ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
+                line = '\t'.join([k, str(ratio), str(crashCount) , lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
                     crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                 f.write(line)
                 f.flush()
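
The ratio is simply reports per day over the window between the first and last report. A standalone sketch with hypothetical dates in place of lDate[1] and lDate[2]:

    from datetime import datetime

    first = datetime(2023, 8, 1)   # placeholder for lDate[1]
    last = datetime(2023, 8, 31)   # placeholder for lDate[2]
    crashCount = 93

    # Same formula as the commit; the +1 keeps a single-day window from dividing by zero.
    ratio = round(crashCount / ((last - first).days + 1), 2)
    print(ratio)  # 3.0
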
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

New commits:
commit da39ae2470edac28a65c3a01ddb49a810bec
Author:     Xisco Fauli
AuthorDate: Thu Oct 20 13:18:52 2022 +0200
Commit:     Xisco Fauli
CommitDate: Fri Oct 21 08:52:26 2022 +0200

    crashreportScraper: Also get info about the last 4 uno commands

    it can be useful

    Change-Id: I8e709775814922c2623350de1de2fe647d7deadd
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/141556
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 7d57ab1f747b..54477f6c4c28 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -162,7 +162,14 @@ def parse_details_and_get_info(url, gitRepo):
         #multiline
         codeLine = "\"" + codeLine + "\""
 
-    return reason, stack, codeLine
+    metadata = soup.find("div", {"id": "metadata"}).tbody
+    tr_list = metadata.find_all("tr")
+    unoCommands = ""
+    for tr in tr_list:
+        if tr.th.text.strip() == "Last-4-Uno-Commands":
+            unoCommands = tr.td.text.strip()
+
+    return reason, stack, codeLine, unoCommands
 
 
 if __name__ == '__main__':
@@ -195,7 +202,7 @@ if __name__ == '__main__':
 
     with open(fileName, "a") as f:
         if bInsertHeader:
             line = '\t'.join(["Name", "Count", "First report", "Last Report",
-                "ID", "Version", "Reason", "OS", "Stack", "Code Lines" '\n'])
+                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
             f.write(line)
             f.flush()
@@ -205,10 +212,10 @@ if __name__ == '__main__':
             try:
                 crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                         "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
-                crashReason, crashStack, codeLine = parse_details_and_get_info(
+                crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                         "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                 line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
-                    crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
+                    crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                 f.write(line)
                 f.flush()
             except (requests.exceptions.Timeout, AttributeError):
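
A minimal sketch of the metadata lookup added here, run against hypothetical markup shaped like a crash_details page:

    from bs4 import BeautifulSoup

    html = """
    <div id="metadata"><table><tbody>
      <tr><th>Last-4-Uno-Commands</th><td>.uno:Save .uno:Undo</td></tr>
      <tr><th>Other-Key</th><td>ignored</td></tr>
    </tbody></table></div>
    """

    soup = BeautifulSoup(html, "html.parser")
    unoCommands = ""
    for tr in soup.find("div", {"id": "metadata"}).tbody.find_all("tr"):
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()
    print(unoCommands)  # .uno:Save .uno:Undo
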
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

New commits:
commit a9a6ec313be4787f99cece793f069f61a8ee73b2
Author:     Xisco Fauli
AuthorDate: Tue Aug 9 11:50:24 2022 +0200
Commit:     Xisco Fauli
CommitDate: Tue Aug 9 14:05:40 2022 +0200

    crashreportScraper: replace quation marks from code

    otherwise, the csv is imported incorrectly

    Change-Id: I5451516b2fdc80a96a4fde83a2c72d701bfd995a
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/138009
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index b0ab5f5bd5f9..7d57ab1f747b 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -149,7 +149,7 @@ def parse_details_and_get_info(url, gitRepo):
                     lines = f.readlines()
                     for index, line in enumerate(lines):
                         if index + 1 == int(codeNumber):
-                            codeLine += line.strip() + "\n"
+                            codeLine += line.strip().replace("\"", "'") + "\n"
             except FileNotFoundError:
                 codeLine += "\n"
                 continue
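
A tiny illustration of the sanitising step, using a made-up source line; swapping the double quotes keeps the surrounding "..." field intact when the TSV is imported:

    # Hypothetical line read from the repository.
    line = 'assert(pDoc && "no document");\n'

    sanitized = line.strip().replace("\"", "'") + "\n"
    print(sanitized)  # assert(pDoc && 'no document');
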
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

New commits:
commit 452311610d38d7e147b2ec2345b76c1b29646159
Author:     Xisco Fauli
AuthorDate: Mon Aug 8 19:04:04 2022 +0200
Commit:     Xisco Fauli
CommitDate: Tue Aug 9 09:51:33 2022 +0200

    crashreportScraper: continue when os_tab is not found

    Change-Id: I293ad70ad2776bfa6ea3e075ba69428963301433
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/137994
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index cad7feead645..b0ab5f5bd5f9 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -62,7 +62,12 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
         raise
 
     count = 0
-    os_tab = soup.find("table", {"id": "os_tab"}).tbody
+    try:
+        os_tab = soup.find("table", {"id": "os_tab"}).tbody
+    except AttributeError:
+        print("os_tab not found")
+        raise
+
     tr_list = os_tab.find_all("tr")
     for tr in tr_list:
         td_list = tr.find_all("td")
@@ -206,5 +211,5 @@ if __name__ == '__main__':
                     crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
                 f.write(line)
                 f.flush()
-            except requests.exceptions.Timeout:
+            except (requests.exceptions.Timeout, AttributeError):
                 continue
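
A compact sketch of the pattern: the helper lets the AttributeError from a missing os_tab propagate, and the per-signature loop skips that entry; the HTML snippet is hypothetical:

    import requests
    from bs4 import BeautifulSoup

    def get_os_tab(html_text):
        soup = BeautifulSoup(html_text, "html.parser")
        try:
            # find() returns None when the table is missing, so .tbody raises AttributeError.
            return soup.find("table", {"id": "os_tab"}).tbody
        except AttributeError:
            print("os_tab not found")
            raise

    for html in ["<p>no table here</p>"]:
        try:
            get_os_tab(html)
        except (requests.exceptions.Timeout, AttributeError):
            continue
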
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

New commits:
commit 6b15decd0acc2c7c0622baba65b9d4c019a4183a
Author:     Xisco Fauli
AuthorDate: Tue Jun 21 15:40:33 2022 +0200
Commit:     Xisco Fauli
CommitDate: Tue Jun 21 23:51:50 2022 +0200

    crashreportScraper: encode the url

    Change-Id: I1f738f017966a6fe48dd9e2cf36dbdf5f50c0cef
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/136229
    Tested-by: Xisco Fauli
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 513f5ec7b75d..cad7feead645 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -17,6 +17,7 @@ import sys
 import os
 import math
 from datetime import datetime
+import urllib.parse
 
 def convert_str_to_date(value):
     value = value.replace('.', '')
@@ -194,11 +195,11 @@ if __name__ == '__main__':
             f.flush()
 
     for k, lDate in crashes.items():
-        if len(k) < 254 and k not in crashesInFile and '`' not in k:
+        if k not in crashesInFile:
             print("Parsing " + k)
             try:
                 crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
-                        "https://crashreport.libreoffice.org/stats/signature/" + k)
+                        "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                 crashReason, crashStack, codeLine = parse_details_and_get_info(
                         "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                 line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
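
A short sketch of what urllib.parse.quote does to a signature-style string (the signature below is made up):

    import urllib.parse

    signature = "SfxItemSet::Get(unsigned short, bool)"
    url = "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(signature)
    print(url)  # spaces, parentheses and colons are percent-encoded
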
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

New commits:
commit 970f03cb9ed68d249fe04cff7d4aa15b0f2c6c35
Author:     Xisco Fauli
AuthorDate: Thu Jun 16 13:16:13 2022 +0200
Commit:     Xisco Fauli
CommitDate: Thu Jun 16 13:26:36 2022 +0200

    crashreportScraper: use argparse to parse the arguments

    Change-Id: Idc1d32683c5113042fe4e7ec97357b6d76c5217e
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135973
    Tested-by: Xisco Fauli
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index aec3e5e9cffb..513f5ec7b75d 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -8,8 +8,9 @@
 
 # Use this script to retrieve information from https://crashreport.libreoffice.org
 # about a specific version of LibreOffice
-# Usage sample: ./crashreportScraper.py 7.2.0.4
+# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
 
+import argparse
 import requests
 from bs4 import BeautifulSoup
 import sys
@@ -160,17 +161,20 @@ def parse_details_and_get_info(url, gitRepo):
 
 if __name__ == '__main__':
 
-    version = sys.argv[1]
+    parser = argparse.ArgumentParser()
 
-    crashes = parse_version_url(
-            "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000=30")
+    parser.add_argument('--version', action='store', dest="version", required=True)
+    parser.add_argument('--repository', action="store", dest="repository", required=True)
+
+    args = parser.parse_args()
 
-    gitRepo = os.path.dirname(os.path.realpath(__file__)) + "/../"
+    crashes = parse_version_url(
+            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000=30")
 
-    print(str(len(crashes)) + " crash reports in version " + version)
+    print(str(len(crashes)) + " crash reports in version " + args.version)
 
     crashesInFile = []
-    fileName = "crashes_" + version.replace(".", "_") + ".csv"
+    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
     print("Using " + fileName)
 
     bInsertHeader = False
@@ -196,7 +200,7 @@ if __name__ == '__main__':
                 crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                         "https://crashreport.libreoffice.org/stats/signature/" + k)
                 crashReason, crashStack, codeLine = parse_details_and_get_info(
-                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, gitRepo)
+                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                 line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                     crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
                 f.write(line)
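
A minimal, self-contained sketch of the argument handling introduced here; the values passed to parse_args() are placeholders:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--version', action='store', dest='version', required=True)
    parser.add_argument('--repository', action='store', dest='repository', required=True)

    args = parser.parse_args(['--version', '7.2.0.4', '--repository', '/path/to/core'])
    print(args.version, args.repository)
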
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

New commits:
commit 3cb921dece44e3e289fc73a64399c1a6c618259c
Author:     tagezi
AuthorDate: Fri May 27 21:25:13 2022 +0300
Commit:     Xisco Fauli
CommitDate: Thu Jun 2 15:52:20 2022 +0200

    crashreportScraper: Removed time parsing, now it just resets.

    Change-Id: I39465cdbc14e28556760a0c1feab22d8998e4d16
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135050
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 1735aa3052b5..aec3e5e9cffb 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -24,15 +24,11 @@ def convert_str_to_date(value):
     value = value.replace('June', 'Jun')
     value = value.replace('July', 'Jul')
     value = value.replace('Sept', 'Sep')
-    value = value.replace('noon', '12:00 pm')
+    # reset the time leaving the date
+    value = ", ".join(value.split(", ")[:-1])
+    dtDate = datetime.strptime(value, '%b %d, %Y')
 
-    if ':' not in value:
-        if 'am' in value:
-            value = value.replace(' am', ':00 am')
-        elif 'pm' in value:
-            value = value.replace(' pm', ':00 pm')
-
-    return datetime.strptime(value, '%b %d, %Y, %H:%M %p')
+    return dtDate.strftime('%y/%m/%d')
 
 def parse_version_url(url):
     crashReports = {}
@@ -193,7 +189,7 @@ if __name__ == '__main__':
             f.write(line)
             f.flush()
 
-    for k, v in crashes.items():
+    for k, lDate in crashes.items():
         if len(k) < 254 and k not in crashesInFile and '`' not in k:
             print("Parsing " + k)
             try:
@@ -201,10 +197,9 @@ if __name__ == '__main__':
                         "https://crashreport.libreoffice.org/stats/signature/" + k)
                 crashReason, crashStack, codeLine = parse_details_and_get_info(
                         "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, gitRepo)
-                line = '\t'.join([k, str(crashCount), v[1].strftime('%y/%m/%d'), v[2].strftime('%y/%m/%d'),
+                line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                     crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
                 f.write(line)
                 f.flush()
             except requests.exceptions.Timeout:
                 continue
-
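
A standalone sketch of the simplified conversion, fed with a hypothetical date cell as the crash report site renders it:

    from datetime import datetime

    value = "Sept. 5, 2023, 4:53 p.m."         # hypothetical cell text
    value = value.replace('.', '').replace('Sept', 'Sep')
    value = ", ".join(value.split(", ")[:-1])   # drop the time, keep the date
    dtDate = datetime.strptime(value, '%b %d, %Y')
    print(dtDate.strftime('%y/%m/%d'))          # 23/09/05
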
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 56 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 19 deletions(-)

New commits:
commit c5ca8b5bffc3b08f74817a3dee2c314b7ced
Author:     Xisco Fauli
AuthorDate: Tue May 31 11:08:21 2022 +0200
Commit:     Xisco Fauli
CommitDate: Tue May 31 15:43:36 2022 +0200

    crashreportScraper: use timeout in requests

    Change-Id: I03f8740fc124c11d250368034bf6e14239df5abe
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135180
    Tested-by: Xisco Fauli
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index aedb7a666c06..1735aa3052b5 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -36,8 +36,13 @@ def convert_str_to_date(value):
 
 def parse_version_url(url):
     crashReports = {}
-    html_text = requests.get(url).text
-    soup = BeautifulSoup(html_text, 'html.parser')
+
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout requesting " + url)
+        sys.exit(1)
 
     table = soup.find("table", {"id": "data-table"}).tbody
     for tr in table.find_all("tr"):
@@ -51,8 +56,12 @@ def parse_version_url(url):
     return crashReports
 
 def parse_reports_and_get_most_recent_report_from_last_page(url):
-    html_text = requests.get(url).text
-    soup = BeautifulSoup(html_text, 'html.parser')
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout")
+        raise
 
     count = 0
     os_tab = soup.find("table", {"id": "os_tab"}).tbody
@@ -67,8 +76,12 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
 
     if last_page > 1:
         url = url + "?page=" + str(last_page)
-        html_text = requests.get(url).text
-        soup = BeautifulSoup(html_text, 'html.parser')
+        try:
+            html_text = requests.get(url, timeout=200).text
+            soup = BeautifulSoup(html_text, 'html.parser')
+        except requests.exceptions.Timeout:
+            print("Timeout")
+            raise
 
     reports = soup.find("div", {"id": "reports"}).tbody
     ID, currentID = "", ""
@@ -102,8 +115,12 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
     return count, ID, version, OS
 
 def parse_details_and_get_info(url, gitRepo):
-    html_text = requests.get(url).text
-    soup = BeautifulSoup(html_text, 'html.parser')
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout")
+        raise
 
     details = soup.find("div", {"id": "details"}).tbody
     tr_list = details.find_all("tr")
@@ -177,16 +194,17 @@ if __name__ == '__main__':
             f.flush()
 
     for k, v in crashes.items():
-        # ignore unresolved crash signatures
-        if len(k) < 254 and k not in crashesInFile and '`' not in k and not k.lower().endswith('.dll') and \
-                not k.lower().endswith('.so') and ".so." not in k.lower():
+        if len(k) < 254 and k not in crashesInFile and '`' not in k:
            print("Parsing " + k)
-            crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
-                    "https://crashreport.libreoffice.org/stats/signature/" + k)
-            crashReason, crashStack, codeLine = parse_details_and_get_info(
-                    "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, gitRepo)
-            line = '\t'.join([k, str(crashCount), v[1].strftime('%y/%m/%d'), v[2].strftime('%y/%m/%d'),
-                crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
-            f.write(line)
-            f.flush()
+            try:
+                crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
+                        "https://crashreport.libreoffice.org/stats/signature/" + k)
+                crashReason, crashStack, codeLine = parse_details_and_get_info(
+                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, gitRepo)
+                line = '\t'.join([k, str(crashCount), v[1].strftime('%y/%m/%d'), v[2].strftime('%y/%m/%d'),
+                    crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
+                f.write(line)
+                f.flush()
+            except requests.exceptions.Timeout:
+                continue
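
A minimal sketch of the guarded-request pattern applied throughout this commit; the commented-out URL is only an example:

    import requests
    from bs4 import BeautifulSoup

    def fetch(url):
        # Bounded wait instead of hanging forever on a stuck connection.
        try:
            html_text = requests.get(url, timeout=200).text
            return BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout requesting " + url)
            raise

    # soup = fetch("https://crashreport.libreoffice.org/stats/version/7.2.0.4")
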
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

New commits:
commit bf6c74f1bb4ba67c16d442a9d8847118891ec89e
Author:     Xisco Fauli
AuthorDate: Mon May 30 20:56:41 2022 +0200
Commit:     Xisco Fauli
CommitDate: Tue May 31 11:10:00 2022 +0200

    crashreportScraper: Add new column to show the stack code

    Change-Id: Id6f9ed8540a8615a80de9cc561579ce069992e85
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135142
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
index 780db5a9dc2c..aedb7a666c06 100755
--- a/bin/crashreportScraper.py
+++ b/bin/crashreportScraper.py
@@ -101,7 +101,7 @@ def parse_reports_and_get_most_recent_report_from_last_page(url):
 
     return count, ID, version, OS
 
-def parse_details_and_get_info(url):
+def parse_details_and_get_info(url, gitRepo):
     html_text = requests.get(url).text
     soup = BeautifulSoup(html_text, 'html.parser')
@@ -110,6 +110,8 @@ def parse_details_and_get_info(url):
     reason = tr_list[8].td.text.strip()
 
     stack = ""
+    codeLine = ""
+
     count = 0
     frames = soup.find("div", {"id": "frames"}).tbody
     for tr in frames.find_all("tr"):
@@ -120,10 +122,27 @@ def parse_details_and_get_info(url):
             stack += source + "\n"
             count += 1
 
+            codeFile = source.split(":")[0]
+            codeNumber = source.split(":")[1]
+            try:
+                with open(os.path.join(gitRepo, codeFile)) as f:
+                    lines = f.readlines()
+                    for index, line in enumerate(lines):
+                        if index + 1 == int(codeNumber):
+                            codeLine += line.strip() + "\n"
+            except FileNotFoundError:
+                codeLine += "\n"
+                continue
+
     if stack:
         #multiline
         stack = "\"" + stack + "\""
-    return reason, stack
+
+    if codeLine:
+        #multiline
+        codeLine = "\"" + codeLine + "\""
+
+    return reason, stack, codeLine
 
 
 if __name__ == '__main__':
@@ -133,6 +152,8 @@ if __name__ == '__main__':
     crashes = parse_version_url(
             "https://crashreport.libreoffice.org/stats/version/" + version + "?limit=1000=30")
 
+    gitRepo = os.path.dirname(os.path.realpath(__file__)) + "/../"
+
     print(str(len(crashes)) + " crash reports in version " + version)
 
     crashesInFile = []
@@ -151,7 +172,7 @@ if __name__ == '__main__':
     with open(fileName, "a") as f:
         if bInsertHeader:
             line = '\t'.join(["Name", "Count", "First report", "Last Report",
-                "ID", "Version", "Reason", "OS", "Stack", '\n'])
+                "ID", "Version", "Reason", "OS", "Stack", "Code Lines" '\n'])
             f.write(line)
             f.flush()
@@ -162,10 +183,10 @@ if __name__ == '__main__':
             print("Parsing " + k)
             crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                     "https://crashreport.libreoffice.org/stats/signature/" + k)
-            crashReason, crashStack = parse_details_and_get_info(
-                    "https://crashreport.libreoffice.org/stats/crash_details/" + crashID)
+            crashReason, crashStack, codeLine = parse_details_and_get_info(
+                    "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, gitRepo)
             line = '\t'.join([k, str(crashCount), v[1].strftime('%y/%m/%d'), v[2].strftime('%y/%m/%d'),
-                crashID, crashVersion, crashReason, crashOS, crashStack, '\n'])
+                crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, '\n'])
             f.write(line)
             f.flush()
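
A self-contained sketch of the file/line lookup behind the new column; the repository path and the frame string are hypothetical:

    import os

    def lookup_code_line(gitRepo, source):
        # source is expected to look like "sw/source/core/doc/docnew.cxx:123".
        codeFile, codeNumber = source.split(":")[0], source.split(":")[1]
        try:
            with open(os.path.join(gitRepo, codeFile)) as f:
                for index, line in enumerate(f.readlines()):
                    if index + 1 == int(codeNumber):
                        return line.strip()
        except FileNotFoundError:
            pass
        return ""

    # print(lookup_code_line("/path/to/core", "sw/source/core/doc/docnew.cxx:123"))
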
[Libreoffice-commits] core.git: bin/crashreportScraper.py
 bin/crashreportScraper.py | 171 ++
 1 file changed, 171 insertions(+)

New commits:
commit e09f49f944fa1d4163bfd52fd824f4216f93558f
Author:     Xisco Fauli
AuthorDate: Fri May 13 15:30:23 2022 +0200
Commit:     Xisco Fauli
CommitDate: Thu May 26 13:07:10 2022 +0200

    bin: add script to retrieve info about crashreports...

    ... from https://crashreport.libreoffice.org

    The script saves the data into a .csv file

    Change-Id: I771d144402a3039851c99b025c52bd6e799f71ec
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/134283
    Tested-by: Jenkins
    Reviewed-by: Xisco Fauli

diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
new file mode 100755
index ..780db5a9dc2c
--- /dev/null
+++ b/bin/crashreportScraper.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Use this script to retrieve information from https://crashreport.libreoffice.org
+# about a specific version of LibreOffice
+# Usage sample: ./crashreportScraper.py 7.2.0.4
+
+import requests
+from bs4 import BeautifulSoup
+import sys
+import os
+import math
+from datetime import datetime
+
+def convert_str_to_date(value):
+    value = value.replace('.', '')
+    value = value.replace('March', 'Mar')
+    value = value.replace('April', 'Apr')
+    value = value.replace('June', 'Jun')
+    value = value.replace('July', 'Jul')
+    value = value.replace('Sept', 'Sep')
+    value = value.replace('noon', '12:00 pm')
+
+    if ':' not in value:
+        if 'am' in value:
+            value = value.replace(' am', ':00 am')
+        elif 'pm' in value:
+            value = value.replace(' pm', ':00 pm')
+
+    return datetime.strptime(value, '%b %d, %Y, %H:%M %p')
+
+def parse_version_url(url):
+    crashReports = {}
+    html_text = requests.get(url).text
+    soup = BeautifulSoup(html_text, 'html.parser')
+
+    table = soup.find("table", {"id": "data-table"}).tbody
+    for tr in table.find_all("tr"):
+        td_list = tr.find_all("td")
+        crashName = td_list[0].a.text.strip()
+        crashNumber = int(td_list[1].text.strip())
+        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
+        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
+        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]
+
+    return crashReports
+
+def parse_reports_and_get_most_recent_report_from_last_page(url):
+    html_text = requests.get(url).text
+    soup = BeautifulSoup(html_text, 'html.parser')
+
+    count = 0
+    os_tab = soup.find("table", {"id": "os_tab"}).tbody
+    tr_list = os_tab.find_all("tr")
+    for tr in tr_list:
+        td_list = tr.find_all("td")
+        count += int(td_list[1].text.strip())
+
+    # There are 50 reports on each page.
+    # Go to the last page based on the total count to get a recent report
+    last_page = math.ceil( count / 50 )
+
+    if last_page > 1:
+        url = url + "?page=" + str(last_page)
+        html_text = requests.get(url).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+
+    reports = soup.find("div", {"id": "reports"}).tbody
+    ID, currentID = "", ""
+    version, currentVersion = "", ""
+    OS, currentOS = "", ""
+
+    tr_list = reports.find_all("tr")
+    for tr in tr_list:
+        td_list = tr.find_all("td")
+
+        currentID = td_list[0].a.text.strip()
+        currentVersion = td_list[2].text.strip().split(': ')[1]
+        currentOS = td_list[3].text.strip()
+
+        # get most recent version
+        # symbols on linux are not very informative generally
+        if currentOS == "windows" and currentVersion > version:
+            version = currentVersion
+            ID = currentID
+            OS = currentOS
+
+    if not version:
+        version = currentVersion
+
+    if not ID:
+        ID = currentID
+
+    if not OS:
+        OS = currentOS
+
+    return count, ID, version, OS
+
+def parse_details_and_get_info(url):
+    html_text = requests.get(url).text
+    soup = BeautifulSoup(html_text, 'html.parser')
+
+    details = soup.find("div", {"id": "details"}).tbody
+    tr_list = details.find_all("tr")
+    reason = tr_list[8].td.text.strip()
+
+    stack = ""
+    count = 0
+    frames = soup.find("div", {"id": "frames"}).tbody
+    for tr in frames.find_all("tr"):
+        td_list = tr.find_all("td")
+        source = td_list[3].text.strip()
+        if source and count <= 10:
+            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
+            stack += source + "\n"
+            count += 1
+
+    if stack:
+        #multiline
+        stack = "\"" + stack + "\""
+    return reason, stack
+
+
+if __name__ ==
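
A small sketch of the report-selection rule in parse_reports_and_get_most_recent_report_from_last_page, using made-up rows in place of the scraped table: prefer a windows report (its symbols are usually richer), falling back to the last row otherwise:

    # Hypothetical (ID, version, OS) rows as they would be scraped from the table.
    rows = [
        ("abc123", "7.2.0.4", "linux"),
        ("def456", "7.2.1.2", "windows"),
        ("ghi789", "7.2.0.4", "windows"),
    ]

    ID, version, OS = "", "", ""
    for currentID, currentVersion, currentOS in rows:
        if currentOS == "windows" and currentVersion > version:
            ID, version, OS = currentID, currentVersion, currentOS

    if not ID:  # no windows report at all: keep the last row seen
        ID, version, OS = currentID, currentVersion, currentOS

    print(ID, version, OS)  # def456 7.2.1.2 windows
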