[MediaWiki-commits] [Gerrit] clone script, overwrite error, mem error - change (analytics/zero-sms)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/168333

Change subject: clone script, overwrite error, mem error
......................................................................

clone script, overwrite error, mem error

Change-Id: Icc87571d1d897969184516c4dc0ea5b9cfdcdf63
---
A scripts/clone-xcs.hql
M scripts/run-hivezero.sh
M scripts/weblogs2.py
M scripts/zero-counts.hql
4 files changed, 88 insertions(+), 10 deletions(-)

  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms refs/changes/33/168333/1

diff --git a/scripts/clone-xcs.hql b/scripts/clone-xcs.hql
new file mode 100644
index 000..5e7f500
--- /dev/null
+++ b/scripts/clone-xcs.hql
@@ -0,0 +1,69 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=false;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Clone one day worth of data to a temp table
+--
+-- Usage:
+--   hive -f clone-xcs.hql -d year=2014 -d month=9 -d day=15 -d xcs=515-05 -d table=tmp_clone
+--
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:xcs=515-05;
+-- set hivevar:table=tmp_clone;
+
+use yurik;
+
+CREATE TABLE IF NOT EXISTS ${table} (
+  hostname string,
+  sequence bigint,
+  dt string,
+  time_firstbyte float,
+  ip string,
+  cache_status string,
+  http_status string,
+  response_size bigint,
+  http_method string,
+  uri_host string,
+  uri_path string,
+  uri_query string,
+  content_type string,
+  referer string,
+  x_forwarded_for string,
+  user_agent string,
+  accept_language string,
+  x_analytics string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+INSERT OVERWRITE TABLE ${table}
+
+SELECT
+*
+FROM wmf_raw.webrequest
+WHERE
+webrequest_source IN ('text', 'mobile')
+AND year=${year}
+AND month=${month}
+AND day=${day}
+AND x_analytics LIKE '%zero=%'
+AND SUBSTR(uri_path, 1, 6) = '/wiki/'
+AND (
+(
+SUBSTR(ip, 1, 9) != '10.128.0.'
+AND SUBSTR(ip, 1, 11) NOT IN (
+'208.80.152.',
+'208.80.153.',
+'208.80.154.',
+'208.80.155.',
+'91.198.174.'
+)
+) OR x_forwarded_for != '-'
+)
+AND SUBSTR(uri_path, 1, 31) != '/wiki/Special:CentralAutoLogin/'
+AND http_status NOT IN ( '301', '302', '303' )
+AND uri_host RLIKE '^[A-Za-z0-9-]+(\\.(zero|m))?\\.[a-z]*\\.org$'
+AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] RLIKE '^[Uu]ndefined$')
+AND regexp_extract(x_analytics, 'zero=([^\;]+)') = ${xcs}
+
diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index e04685f..0df0a65 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -9,5 +9,5 @@
 for ((day = $3; day <= $last; day++)); do
     printf -v p %04d-%02d-%02d $1 $2 $day
     echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p
-    hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p
+    export HADOOP_HEAPSIZE=1024 hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d date=$p
 done
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 2432422..0812611 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -2,6 +2,7 @@
 import StringIO
 import re
 import collections
+
 from pandas import read_table, pivot_table
 from pandas.core.frame import DataFrame, Series
 import numpy as np
@@ -36,7 +37,7 @@
         if self.settings.pathCacheLegacy:
             self.pathCacheLegacy = self.normalizePath(self.settings.pathCacheLegacy)
         else:
-            self.pathCacheLegacy = self.settings.pathCacheLegacy
+            self.pathCacheLegacy = False

         self.legacyFileRe = re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE)

@@ -86,6 +87,8 @@
             else:
                 raise ValueError('Unrecognized key (%s) in file %s' % (joinValues(vals), f))
             (dt, typ, xcs, via, ipset, https, lang, subdomain, site, count) = vals
+
+            via = via.upper()

             error = False
             if xcs == '404-01b':
@@ -151,7 +154,8 @@
             # 0      1      2       3    4  5    6         7
             # 250-99 DIRECT default http ru zero wikipedia 1000
             (xcs, via, ipset, https, lang, subdomain, site, count) = vals
-            via = via if via else u'DIRECT'
+
+            via = via.upper() if via else u'DIRECT'
             ipset = ipset if ipset else u'default'
             https = https if https else u'http'

@@ -195,9 +199,9 @@
         if legacyStats:
             # Only add legacy data for dates that we haven't seen in hadoop
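A note on the run-hivezero.sh hunk above: in bash, `export VAR=value cmd ...` does not execute `cmd`. The `export` builtin only accepts variable names (and the `-f`/`-n` options), so as written the hive invocation would not run at all. The conventional one-shot form is a plain `VAR=value cmd` prefix, which sets the variable in the environment of that single command. A minimal sketch of the loop in that form, assuming (as in the original script, whose earlier lines are not shown here) that `$1` is the year, `$2` the month, `$3` the first day, and `$last` is computed before the loop:

#!/bin/bash
# Sketch only: run zero-counts.hql once per day with a larger Hive client heap.
# Assumes $1=year, $2=month, $3=first day, and $last set earlier in the script.
for ((day = $3; day <= $last; day++)); do
    printf -v p '%04d-%02d-%02d' "$1" "$2" "$day"
    echo hive -f zero-counts.hql -d year="$1" -d month="$2" -d day="$day" -d date="$p"
    # VAR=value cmd exports HADOOP_HEAPSIZE only for this single hive run;
    # unlike `export VAR=value cmd`, the command actually executes.
    HADOOP_HEAPSIZE=1024 hive -f zero-counts.hql -d year="$1" -d month="$2" -d day="$day" -d date="$p"
done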
[MediaWiki-commits] [Gerrit] clone script, overwrite error, mem error - change (analytics/zero-sms)
Yurik has submitted this change and it was merged.

Change subject: clone script, overwrite error, mem error
......................................................................

clone script, overwrite error, mem error

Change-Id: Icc87571d1d897969184516c4dc0ea5b9cfdcdf63
---
A scripts/clone-xcs.hql
M scripts/run-hivezero.sh
M scripts/weblogs2.py
M scripts/zero-counts.hql
4 files changed, 88 insertions(+), 10 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved
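A usage note on clone-xcs.hql: Hive's `-d` variable substitution is textual (the script's other references such as `${table}` and `${year}` rely on this), and the final predicate compares the string returned by `regexp_extract` against the bare `${xcs}`. An unquoted value like 515-05 would therefore be substituted as the arithmetic expression 515-05 rather than a string literal, and the comparison would not match. A hypothetical invocation that passes the quotes from the shell so Hive sees a string:

# Hypothetical invocation; xcs is quoted so the predicate becomes
#   regexp_extract(x_analytics, 'zero=([^\;]+)') = '515-05'
# instead of the arithmetic expression 515-05.
hive -f clone-xcs.hql \
    -d year=2014 -d month=9 -d day=15 \
    -d xcs="'515-05'" \
    -d table=tmp_clone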