[MediaWiki-commits] [Gerrit] shell bug, weblogs2 to combine multi-sourced data - change (analytics/zero-sms)
Yurik has uploaded a new change for review. https://gerrit.wikimedia.org/r/168195 Change subject: shell bug, weblogs2 to combine multi-sourced data .. shell bug, weblogs2 to combine multi-sourced data Change-Id: Idc09a1ee5434a8e134530cbb722f25430efff6b8 --- M scripts/run-hivezero.sh A scripts/weblogs2.py M scripts/zero-counts.hql 3 files changed, 301 insertions(+), 29 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms refs/changes/95/168195/1 diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh index c1a8e96..e04685f 100755 --- a/scripts/run-hivezero.sh +++ b/scripts/run-hivezero.sh @@ -8,6 +8,6 @@ for ((day = $3; day <= $last; day++)); do printf -v p %04d-%02d-%02d $1 $2 $day - echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p - hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p + echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p + hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p done diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py new file mode 100644 index 000..2432422 --- /dev/null +++ b/scripts/weblogs2.py @@ -0,0 +1,299 @@ +# coding=utf-8 +import StringIO +import re +import collections +from pandas import read_table, pivot_table +from pandas.core.frame import DataFrame, Series +import numpy as np + +from logprocessor import * + + +def addStat(stats, date, dataType, xcs, via, ipset, https, lang, subdomain, site): +key = (date, dataType, xcs, via, ipset, 'https' if https else 'http', lang, subdomain, site) +if key in stats: +stats[key] += 1 +else: +datetime.strptime(date, '%Y-%m-%d') # Validate date - slow operation, do it only once per key +stats[key] = 1 + + +columnHdrCache = u'xcs,via,ipset,https,lang,subdomain,site,count'.split(',') +columnHdrCacheLegacy = u'date,type,xcs,via,ipset,https,lang,subdomain,site,count'.split(',') +columnHdrResult = u'date,type,xcs,via,ipset,https,lang,subdomain,site,iszero,ison,count'.split(',') 
+validSubDomains = {'m', 'zero', 'mobile', 'wap'} +validHttpCode = {'200', '304'} + + +class WebLogProcessor2(LogProcessor): +def __init__(self, settingsFile='settings/weblogs2.json'): +super(WebLogProcessor2, self).__init__(settingsFile, 'web2') + +self._configs = None +self.dateDirRe = re.compile(r'^date=(\d\d\d\d-\d\d-\d\d)$') +self.fileRe = re.compile(r'^\d+') +self.combinedFile = os.path.join(self.pathGraphs, 'combined-all.tsv') +if self.settings.pathCacheLegacy: +self.pathCacheLegacy = self.normalizePath(self.settings.pathCacheLegacy) +else: +self.pathCacheLegacy = self.settings.pathCacheLegacy + +self.legacyFileRe = re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE) + +def defaultSettings(self, suffix): +s = super(WebLogProcessor2, self).defaultSettings(suffix) +s.pathCacheLegacy = False +return s + +def downloadConfigs(self): +if self._configs: +return self._configs +wiki = self.getWiki() +# https://zero.wikimedia.org/w/api.php?action=zeroportal&type=analyticsconfig&format=jsonfm +configs = wiki('zeroportal', type='analyticsconfig').zeroportal +for cfs in configs.values(): +for c in cfs: +c['from'] = datetime.strptime(c['from'], '%Y-%m-%dT%H:%M:%SZ') +if c.before is None: +c.before = datetime.max +else: +c.before = datetime.strptime(c.before, '%Y-%m-%dT%H:%M:%SZ') +c.languages = True if True == c.languages else set(c.languages) +c.sites = True if True == c.sites else set(c.sites) +c.via = set(c.via) +c.ipsets = set(c.ipsets) +self._configs = configs +return self._configs + +def combineStatsLegacy(self): +if not self.pathCacheLegacy: +return {} +safePrint('Combine legacy stat files') +# Logs did not contain the VIA X-Analytics tag before this date +ignoreViaBefore = datetime(2014, 3, 22) +configs = self.downloadConfigs() +stats = collections.defaultdict(int) +for f in os.listdir(self.pathCacheLegacy): +if not self.legacyFileRe.match(f): +continue +for vals in readData(os.path.join(self.pathCacheLegacy, f), columnHdrCacheLegacy): +# 0 12 3 4 
56 78 9 +# 2014-07-25 DATA 250-99 DIRECT default http ru zero wikipedia 1000 +if len(vals) != 10: +if len(vals) == 11 and vals[3] == '': +safePrint('Fixing extra empty xcs in file %s' % f) +del vals[3] +
[MediaWiki-commits] [Gerrit] shell bug, weblogs2 to combine multi-sourced data - change (analytics/zero-sms)
Yurik has submitted this change and it was merged. Change subject: shell bug, weblogs2 to combine multi-sourced data .. shell bug, weblogs2 to combine multi-sourced data Change-Id: Idc09a1ee5434a8e134530cbb722f25430efff6b8 --- M scripts/run-hivezero.sh A scripts/weblogs2.py M scripts/zero-counts.hql 3 files changed, 301 insertions(+), 29 deletions(-) Approvals: Yurik: Verified; Looks good to me, approved diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh index c1a8e96..e04685f 100755 --- a/scripts/run-hivezero.sh +++ b/scripts/run-hivezero.sh @@ -8,6 +8,6 @@ for ((day = $3; day <= $last; day++)); do printf -v p %04d-%02d-%02d $1 $2 $day - echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p - hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p + echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p + hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p done diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py new file mode 100644 index 000..2432422 --- /dev/null +++ b/scripts/weblogs2.py @@ -0,0 +1,299 @@ +# coding=utf-8 +import StringIO +import re +import collections +from pandas import read_table, pivot_table +from pandas.core.frame import DataFrame, Series +import numpy as np + +from logprocessor import * + + +def addStat(stats, date, dataType, xcs, via, ipset, https, lang, subdomain, site): +key = (date, dataType, xcs, via, ipset, 'https' if https else 'http', lang, subdomain, site) +if key in stats: +stats[key] += 1 +else: +datetime.strptime(date, '%Y-%m-%d') # Validate date - slow operation, do it only once per key +stats[key] = 1 + + +columnHdrCache = u'xcs,via,ipset,https,lang,subdomain,site,count'.split(',') +columnHdrCacheLegacy = u'date,type,xcs,via,ipset,https,lang,subdomain,site,count'.split(',') +columnHdrResult = u'date,type,xcs,via,ipset,https,lang,subdomain,site,iszero,ison,count'.split(',') +validSubDomains = {'m', 'zero', 'mobile', 'wap'} +validHttpCode = 
{'200', '304'} + + +class WebLogProcessor2(LogProcessor): +def __init__(self, settingsFile='settings/weblogs2.json'): +super(WebLogProcessor2, self).__init__(settingsFile, 'web2') + +self._configs = None +self.dateDirRe = re.compile(r'^date=(\d\d\d\d-\d\d-\d\d)$') +self.fileRe = re.compile(r'^\d+') +self.combinedFile = os.path.join(self.pathGraphs, 'combined-all.tsv') +if self.settings.pathCacheLegacy: +self.pathCacheLegacy = self.normalizePath(self.settings.pathCacheLegacy) +else: +self.pathCacheLegacy = self.settings.pathCacheLegacy + +self.legacyFileRe = re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE) + +def defaultSettings(self, suffix): +s = super(WebLogProcessor2, self).defaultSettings(suffix) +s.pathCacheLegacy = False +return s + +def downloadConfigs(self): +if self._configs: +return self._configs +wiki = self.getWiki() +# https://zero.wikimedia.org/w/api.php?action=zeroportal&type=analyticsconfig&format=jsonfm +configs = wiki('zeroportal', type='analyticsconfig').zeroportal +for cfs in configs.values(): +for c in cfs: +c['from'] = datetime.strptime(c['from'], '%Y-%m-%dT%H:%M:%SZ') +if c.before is None: +c.before = datetime.max +else: +c.before = datetime.strptime(c.before, '%Y-%m-%dT%H:%M:%SZ') +c.languages = True if True == c.languages else set(c.languages) +c.sites = True if True == c.sites else set(c.sites) +c.via = set(c.via) +c.ipsets = set(c.ipsets) +self._configs = configs +return self._configs + +def combineStatsLegacy(self): +if not self.pathCacheLegacy: +return {} +safePrint('Combine legacy stat files') +# Logs did not contain the VIA X-Analytics tag before this date +ignoreViaBefore = datetime(2014, 3, 22) +configs = self.downloadConfigs() +stats = collections.defaultdict(int) +for f in os.listdir(self.pathCacheLegacy): +if not self.legacyFileRe.match(f): +continue +for vals in readData(os.path.join(self.pathCacheLegacy, f), columnHdrCacheLegacy): +# 0 12 3 4 56 78 9 +# 2014-07-25 DATA 250-99 DIRECT default http ru zero 
wikipedia 1000 +if len(vals) != 10: +if len(vals) == 11 and vals[3] == '': +safePrint('Fixing extra empty xcs in file %s' % f) +del vals[3] +else: +raise