Author: sebb
Date: Sat Sep 26 00:25:27 2015
New Revision: 1705389
URL: http://svn.apache.org/viewvc?rev=1705389&view=rev
Log:
COMDEV-161 mailglomper.py may count a message multiple times
Fixed RE to look for the mbox "From " separator at the start of a line, instead of matching "Date: " anywhere in the mailbox
Also changed the code to read the data line by line rather than slurping the entire mailbox into
memory
Added some timestamp traces to check on performance
Modified:
comdev/reporter.apache.org/trunk/mailglomper.py
Modified: comdev/reporter.apache.org/trunk/mailglomper.py
URL:
http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/mailglomper.py?rev=1705389&r1=1705388&r2=1705389&view=diff
==============================================================================
--- comdev/reporter.apache.org/trunk/mailglomper.py (original)
+++ comdev/reporter.apache.org/trunk/mailglomper.py Sat Sep 26 00:25:27 2015
@@ -58,18 +58,29 @@ Potentially the generated file could use
but this would require converting the input file and potentially allowing both
separators in
the files that process the output for a short while.
"""
+
+def tsprint(s): # print with timestamp
+ print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s
+
+tsprint("Started")
+
for mlist in re.finditer(r"<a href='([-a-z0-9]+)/'", data):
ml = mlist.group(1)
+ start = time.time()
+# print(ml)
y += 1
mls[ml] = {}
mls[ml]['quarterly'] = [0, 0];
mls[ml]['weekly'] = {}
- for date in months:
-
+ mlct = 0
+ for date in months:
try:
- mldata =
urllib.urlopen("http://mail-archives.us.apache.org/mod_mbox/%s/%s.mbox" % (ml,
date)).read()
- if mldata:
- for c in re.finditer(r"Date: (.+)", mldata):
+ ct = 0
+ mldata =
urllib.urlopen("http://mail-archives.us.apache.org/mod_mbox/%s/%s.mbox" % (ml,
date))
+ for line in mldata:
+ c = re.match(r"^From \S+ (.+)", line)
+ if c:
+ ct += 1
try:
d = email.utils.parsedate(c.group(1))
timestamp = int(time.mktime(d))
@@ -79,16 +90,19 @@ for mlist in re.finditer(r"<a href='([-a
mls[ml]['quarterly'][0] += 1
elif timestamp >= wayafter:
mls[ml]['quarterly'][1] += 1
- except:
+ except Exception as err:
+ tsprint(err)
pass
-
+# tsprint("%s %s: has %u mails" % (ml, date, ct)) # total for
month
+ mlct += ct
except Exception as err:
- print(err)
- print("%s: %u" % (ml, mls[ml]['quarterly'][0]))
+ tsprint(err)
+ tsprint("Info: %s has %u mails (%u secs)" % (ml, mlct, time.time() -
start)) # total for mail group
if y == 50:
y = 0
with open("data/maildata_extended.json",'w+') as f:
f.write(json.dumps(mls, indent=1))
+tsprint("Completed scanning, writing JSON")
with open("data/maildata_extended.json",'w+') as f:
f.write(json.dumps(mls, indent=1))
print("Dumped JSON")