Yurik has submitted this change and it was merged.

Change subject: error counting
......................................................................


error counting

Change-Id: I667637cbc3707413238ef509abcb0833c9b25174
---
M scripts/weblogs.py
1 file changed, 33 insertions(+), 24 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/weblogs.py b/scripts/weblogs.py
index e465ad9..0057fdf 100644
--- a/scripts/weblogs.py
+++ b/scripts/weblogs.py
@@ -56,7 +56,7 @@
         self.pathGraphs = self.normalizePath(self.rawPathGraphs)
 
         # zero.tsv.log-20140808.gz
-        logReStr = r'zero\.tsv\.log-\d+\.gz'
+        logReStr = r'zero\.tsv\.log-(\d+)\.gz'
         self.logFileRe = re.compile(r'^' + logReStr + r'$', re.IGNORECASE)
         self.statFileRe = re.compile(r'^(' + logReStr + r')__\d+\.json$', re.IGNORECASE)
         self.urlRe = re.compile(r'^https?://([^/]+)', re.IGNORECASE)
@@ -114,14 +114,17 @@
         safePrint('Processing log files')
         statFiles = {}
         for f in os.listdir(self.pathLogs):
-            if not self.logFileRe.match(f):
+            m = self.logFileRe.match(f)
+            if not m:
                 continue
             logFile = os.path.join(self.pathLogs, f)
             logSize = os.stat(logFile).st_size
             statFile = os.path.join(self.pathStats, f + '__' + unicode(logSize) + '.json')
             statFiles[f] = statFile
             if not os.path.exists(statFile):
-                self.processLogFile(logFile, statFile)
+                err = m.group(1)
+                err = '-'.join([err[0:4], err[4:6], err[6:8]])
+                self.processLogFile(logFile, statFile, err)
 
         # Clean up older stat files (if gz file size has changed)
         removeFiles = []
@@ -137,7 +140,7 @@
         for f in removeFiles:
             os.remove(f)
 
-    def processLogFile(self, logFile, statFile):
+    def processLogFile(self, logFile, statFile, errDate):
         """
             0  cp1046.eqiad.wmnet
             1  13866141087
@@ -156,32 +159,32 @@
             .. Version/4.0 Mobile Safari/534.30
             -2 en-US
             -1 zero=410-01
-        :param logFile:
-        :param statFile:
-        :return:
         """
 
         safePrint('Processing %s' % logFile)
         stats = {}
-        count = 1
+        count = 0
+        errors = 0
 
         if logFile.endswith('.gz'):
             streamData = io.TextIOWrapper(io.BufferedReader(gzip.open(logFile)), encoding='utf8', errors='ignore')
         else:
             streamData = io.open(logFile, 'r', encoding='utf8', errors='ignore')
         for line in streamData:
+            count += 1
             if count % 500000 == 0:
                 safePrint('%d lines processed' % count)
-            count += 1
 
             l = line.strip('\n\r').split('\t')
 
             if len(l) < 16:
                 safePrint(u'String too short - %d parts\n%s' % (len(l), line))
+                errors += 1
                 continue
             analytics = l[-1]
             if '=' not in analytics:  # X-Analytics should have at least some values
                 safePrint(u'Analytics is not valid - "%s"\n%s' % (analytics, line))
+                errors += 1
                 continue
 
             host = l[8]
@@ -189,12 +192,10 @@
                 m = self.duplUrlRe.match(host)
                 if m:
                     host = host[len(m.group(1)):]
-                else:
-                    safePrint(u'Duplicate URL failed: "%s"\n%s' % (host, line))
-                    continue
             m = self.urlRe.match(host)
             if not m:
                 safePrint(u'URL parsing failed: "%s"\n%s' % (host, line))
+                errors += 1
                 continue
             host = m.group(1)
             if host.endswith(':80'):
@@ -202,22 +203,28 @@
             if host.endswith('.'):
                 host = host[:-1]
             hostParts = host.split('.')
-            hostParts.pop()  # assume this is the domain root, e.g. org, net, info, net, ...
             if hostParts[0] == 'www':
                 del hostParts[0]
-            site = hostParts.pop()
-            if hostParts:
-                subdomain = hostParts.pop()
-                if subdomain == 'm' or subdomain == 'zero':
-                    site = subdomain + '.' + site
-                    lang = hostParts.pop() if hostParts else ''
-                else:
-                    lang = subdomain
+            lang = ''
+            if len(hostParts) >= 2:
+                hostParts.pop()  # assume last element is the domain root, e.g. org, net, info, net, ...
+                site = hostParts.pop()
+                if hostParts:
+                    subdomain = hostParts.pop()
+                    if subdomain == 'mobile':
+                        subdomain = 'm'
+                    if subdomain == 'm' or subdomain == 'zero':
+                        site = subdomain + '.' + site
+                        lang = hostParts.pop() if hostParts else ''
+                    else:
+                        lang = subdomain
             else:
-                lang = ''
+                hostParts = False
+                site = ''
 
-            if hostParts:
+            if hostParts or False == hostParts:
                 safePrint(u'Unknown host %s\n%s' % (host, line))
+                errors += 1
                 continue
 
             analytics = dict([x.split('=', 2) for x in set(analytics.split(';'))])
@@ -234,7 +241,9 @@
             else:
                 datetime.strptime(dt, '%Y-%m-%d')  # Validate date - slow operation, do it only once per key
                 stats[key] = 1
-
+        if errors > 0:
+            key = '|'.join([errDate, '000-00', 'ERROR', 'default', 'http', '', 'errors'])
+            stats[key] = errors
         saveJson(statFile, stats)
 
     def combineStats(self, tempFile=''):

-- 
To view, visit https://gerrit.wikimedia.org/r/153371
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I667637cbc3707413238ef509abcb0833c9b25174
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>
Gerrit-Reviewer: Yurik <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to