Author: sebb
Date: Sat Sep 26 00:25:27 2015
New Revision: 1705389

URL: http://svn.apache.org/viewvc?rev=1705389&view=rev
Log:
COMDEV-161 mailglomper.py may count a message multiple times
Fixed RE to look for "From " at the start of a line
Also changed code to read data by line rather than slurping entire mailbox into memory
Added some timestamp traces to check on performance

Modified:
    comdev/reporter.apache.org/trunk/mailglomper.py

Modified: comdev/reporter.apache.org/trunk/mailglomper.py
URL: http://svn.apache.org/viewvc/comdev/reporter.apache.org/trunk/mailglomper.py?rev=1705389&r1=1705388&r2=1705389&view=diff
==============================================================================
--- comdev/reporter.apache.org/trunk/mailglomper.py (original)
+++ comdev/reporter.apache.org/trunk/mailglomper.py Sat Sep 26 00:25:27 2015
@@ -58,18 +58,29 @@ Potentially the generated file could use
 but this would require converting the input file and potentially allowing both separators in
 the files that process the output for a short while.
 """
+
+def tsprint(s): # print with timestamp
+    print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), s
+
+tsprint("Started")
+
 for mlist in re.finditer(r"<a href='([-a-z0-9]+)/'", data):
     ml = mlist.group(1)
+    start = time.time()
+#     print(ml)
     y += 1
     mls[ml] = {}
     mls[ml]['quarterly'] = [0, 0];
     mls[ml]['weekly'] = {}
-    for date in months:
-            
+    mlct = 0
+    for date in months:            
         try:
-            mldata = urllib.urlopen("http://mail-archives.us.apache.org/mod_mbox/%s/%s.mbox" % (ml, date)).read()
-            if mldata:
-                for c in re.finditer(r"Date: (.+)", mldata):
+            ct = 0
+            mldata = urllib.urlopen("http://mail-archives.us.apache.org/mod_mbox/%s/%s.mbox" % (ml, date))
+            for line in mldata:
+                c = re.match(r"^From \S+ (.+)", line)
+                if c:
+                    ct += 1
                     try:
                         d = email.utils.parsedate(c.group(1))
                         timestamp = int(time.mktime(d))
@@ -79,16 +90,19 @@ for mlist in re.finditer(r"<a href='([-a
                             mls[ml]['quarterly'][0] += 1
                         elif timestamp >= wayafter:
                             mls[ml]['quarterly'][1] += 1
-                    except:
+                    except Exception as err:
+                        tsprint(err)
                         pass
-                        
+#             tsprint("%s %s: has  %u mails" % (ml, date, ct)) # total for month
+            mlct += ct
         except Exception as err:
-            print(err)
-    print("%s: %u" % (ml, mls[ml]['quarterly'][0]))
+            tsprint(err)
+    tsprint("Info: %s has  %u mails (%u secs)" % (ml, mlct, time.time() - start)) # total for mail group
     if y == 50:
         y = 0
         with open("data/maildata_extended.json",'w+') as f:
             f.write(json.dumps(mls, indent=1))
+tsprint("Completed scanning, writing JSON")
 with open("data/maildata_extended.json",'w+') as f:
     f.write(json.dumps(mls, indent=1))
 print("Dumped JSON")


Reply via email to