[MediaWiki-commits] [Gerrit] shell bug, weblogs2 to combine multi-sourced data - change (analytics/zero-sms)

2014-10-22 Thread Yurik (Code Review)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/168195

Change subject: shell bug, weblogs2 to combine multi-sourced data
..

shell bug, weblogs2 to combine multi-sourced data

Change-Id: Idc09a1ee5434a8e134530cbb722f25430efff6b8
---
M scripts/run-hivezero.sh
A scripts/weblogs2.py
M scripts/zero-counts.hql
3 files changed, 301 insertions(+), 29 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms 
refs/changes/95/168195/1

diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index c1a8e96..e04685f 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -8,6 +8,6 @@
 
 for ((day = $3; day <= $last; day++)); do
printf -v p %04d-%02d-%02d $1 $2 $day
-   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
-   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day 
-d date=$p
+   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d 
date=$p
 done
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
new file mode 100644
index 000..2432422
--- /dev/null
+++ b/scripts/weblogs2.py
@@ -0,0 +1,299 @@
+# coding=utf-8
+import StringIO
+import re
+import collections
+from pandas import read_table, pivot_table
+from pandas.core.frame import DataFrame, Series
+import numpy as np
+
+from logprocessor import *
+
+
+def addStat(stats, date, dataType, xcs, via, ipset, https, lang, subdomain, 
site):
+key = (date, dataType, xcs, via, ipset, 'https' if https else 'http', 
lang, subdomain, site)
+if key in stats:
+stats[key] += 1
+else:
+datetime.strptime(date, '%Y-%m-%d')  # Validate date - slow operation, 
do it only once per key
+stats[key] = 1
+
+
+columnHdrCache = u'xcs,via,ipset,https,lang,subdomain,site,count'.split(',')
+columnHdrCacheLegacy = 
u'date,type,xcs,via,ipset,https,lang,subdomain,site,count'.split(',')
+columnHdrResult = 
u'date,type,xcs,via,ipset,https,lang,subdomain,site,iszero,ison,count'.split(',')
+validSubDomains = {'m', 'zero', 'mobile', 'wap'}
+validHttpCode = {'200', '304'}
+
+
+class WebLogProcessor2(LogProcessor):
+def __init__(self, settingsFile='settings/weblogs2.json'):
+super(WebLogProcessor2, self).__init__(settingsFile, 'web2')
+
+self._configs = None
+self.dateDirRe = re.compile(r'^date=(\d\d\d\d-\d\d-\d\d)$')
+self.fileRe = re.compile(r'^\d+')
+self.combinedFile = os.path.join(self.pathGraphs, 'combined-all.tsv')
+if self.settings.pathCacheLegacy:
+self.pathCacheLegacy = 
self.normalizePath(self.settings.pathCacheLegacy)
+else:
+self.pathCacheLegacy = self.settings.pathCacheLegacy
+
+self.legacyFileRe = 
re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE)
+
+def defaultSettings(self, suffix):
+s = super(WebLogProcessor2, self).defaultSettings(suffix)
+s.pathCacheLegacy = False
+return s
+
+def downloadConfigs(self):
+if self._configs:
+return self._configs
+wiki = self.getWiki()
+# 
https://zero.wikimedia.org/w/api.php?action=zeroportal&type=analyticsconfig&format=jsonfm
+configs = wiki('zeroportal', type='analyticsconfig').zeroportal
+for cfs in configs.values():
+for c in cfs:
+c['from'] = datetime.strptime(c['from'], '%Y-%m-%dT%H:%M:%SZ')
+if c.before is None:
+c.before = datetime.max
+else:
+c.before = datetime.strptime(c.before, 
'%Y-%m-%dT%H:%M:%SZ')
+c.languages = True if True == c.languages else set(c.languages)
+c.sites = True if True == c.sites else set(c.sites)
+c.via = set(c.via)
+c.ipsets = set(c.ipsets)
+self._configs = configs
+return self._configs
+
+def combineStatsLegacy(self):
+if not self.pathCacheLegacy:
+return {}
+safePrint('Combine legacy stat files')
+# Logs did not contain the VIA X-Analytics tag before this date
+ignoreViaBefore = datetime(2014, 3, 22)
+configs = self.downloadConfigs()
+stats = collections.defaultdict(int)
+for f in os.listdir(self.pathCacheLegacy):
+if not self.legacyFileRe.match(f):
+continue
+for vals in readData(os.path.join(self.pathCacheLegacy, f), 
columnHdrCacheLegacy):
+# 0  12  3  4   56  78
 9
+# 2014-07-25 DATA 250-99 DIRECT default http ru zero 
wikipedia 1000
+if len(vals) != 10:
+if len(vals) == 11 and vals[3] == '':
+safePrint('Fixing extra empty xcs in file %s' % f)
+del vals[3]
+  

[MediaWiki-commits] [Gerrit] shell bug, weblogs2 to combine multi-sourced data - change (analytics/zero-sms)

2014-10-22 Thread Yurik (Code Review)
Yurik has submitted this change and it was merged.

Change subject: shell bug, weblogs2 to combine multi-sourced data
..


shell bug, weblogs2 to combine multi-sourced data

Change-Id: Idc09a1ee5434a8e134530cbb722f25430efff6b8
---
M scripts/run-hivezero.sh
A scripts/weblogs2.py
M scripts/zero-counts.hql
3 files changed, 301 insertions(+), 29 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index c1a8e96..e04685f 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -8,6 +8,6 @@
 
 for ((day = $3; day <= $last; day++)); do
printf -v p %04d-%02d-%02d $1 $2 $day
-   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
-   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day 
-d date=$p
+   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d 
date=$p
 done
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
new file mode 100644
index 000..2432422
--- /dev/null
+++ b/scripts/weblogs2.py
@@ -0,0 +1,299 @@
+# coding=utf-8
+import StringIO
+import re
+import collections
+from pandas import read_table, pivot_table
+from pandas.core.frame import DataFrame, Series
+import numpy as np
+
+from logprocessor import *
+
+
+def addStat(stats, date, dataType, xcs, via, ipset, https, lang, subdomain, 
site):
+key = (date, dataType, xcs, via, ipset, 'https' if https else 'http', 
lang, subdomain, site)
+if key in stats:
+stats[key] += 1
+else:
+datetime.strptime(date, '%Y-%m-%d')  # Validate date - slow operation, 
do it only once per key
+stats[key] = 1
+
+
+columnHdrCache = u'xcs,via,ipset,https,lang,subdomain,site,count'.split(',')
+columnHdrCacheLegacy = 
u'date,type,xcs,via,ipset,https,lang,subdomain,site,count'.split(',')
+columnHdrResult = 
u'date,type,xcs,via,ipset,https,lang,subdomain,site,iszero,ison,count'.split(',')
+validSubDomains = {'m', 'zero', 'mobile', 'wap'}
+validHttpCode = {'200', '304'}
+
+
+class WebLogProcessor2(LogProcessor):
+def __init__(self, settingsFile='settings/weblogs2.json'):
+super(WebLogProcessor2, self).__init__(settingsFile, 'web2')
+
+self._configs = None
+self.dateDirRe = re.compile(r'^date=(\d\d\d\d-\d\d-\d\d)$')
+self.fileRe = re.compile(r'^\d+')
+self.combinedFile = os.path.join(self.pathGraphs, 'combined-all.tsv')
+if self.settings.pathCacheLegacy:
+self.pathCacheLegacy = 
self.normalizePath(self.settings.pathCacheLegacy)
+else:
+self.pathCacheLegacy = self.settings.pathCacheLegacy
+
+self.legacyFileRe = 
re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE)
+
+def defaultSettings(self, suffix):
+s = super(WebLogProcessor2, self).defaultSettings(suffix)
+s.pathCacheLegacy = False
+return s
+
+def downloadConfigs(self):
+if self._configs:
+return self._configs
+wiki = self.getWiki()
+# 
https://zero.wikimedia.org/w/api.php?action=zeroportal&type=analyticsconfig&format=jsonfm
+configs = wiki('zeroportal', type='analyticsconfig').zeroportal
+for cfs in configs.values():
+for c in cfs:
+c['from'] = datetime.strptime(c['from'], '%Y-%m-%dT%H:%M:%SZ')
+if c.before is None:
+c.before = datetime.max
+else:
+c.before = datetime.strptime(c.before, 
'%Y-%m-%dT%H:%M:%SZ')
+c.languages = True if True == c.languages else set(c.languages)
+c.sites = True if True == c.sites else set(c.sites)
+c.via = set(c.via)
+c.ipsets = set(c.ipsets)
+self._configs = configs
+return self._configs
+
+def combineStatsLegacy(self):
+if not self.pathCacheLegacy:
+return {}
+safePrint('Combine legacy stat files')
+# Logs did not contain the VIA X-Analytics tag before this date
+ignoreViaBefore = datetime(2014, 3, 22)
+configs = self.downloadConfigs()
+stats = collections.defaultdict(int)
+for f in os.listdir(self.pathCacheLegacy):
+if not self.legacyFileRe.match(f):
+continue
+for vals in readData(os.path.join(self.pathCacheLegacy, f), 
columnHdrCacheLegacy):
+# 0  12  3  4   56  78
 9
+# 2014-07-25 DATA 250-99 DIRECT default http ru zero 
wikipedia 1000
+if len(vals) != 10:
+if len(vals) == 11 and vals[3] == '':
+safePrint('Fixing extra empty xcs in file %s' % f)
+del vals[3]
+else:
+raise