[MediaWiki-commits] [Gerrit] clone script, overwrite error, mem error - change (analytics/zero-sms)

2014-10-23 Thread Yurik (Code Review)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/168333

Change subject: clone script, overwrite error, mem error
..

clone script, overwrite error, mem error

Change-Id: Icc87571d1d897969184516c4dc0ea5b9cfdcdf63
---
A scripts/clone-xcs.hql
M scripts/run-hivezero.sh
M scripts/weblogs2.py
M scripts/zero-counts.hql
4 files changed, 88 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms 
refs/changes/33/168333/1

diff --git a/scripts/clone-xcs.hql b/scripts/clone-xcs.hql
new file mode 100644
index 0000000..5e7f500
--- /dev/null
+++ b/scripts/clone-xcs.hql
@@ -0,0 +1,69 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=false;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Clone one day worth of data to a temp table
+--
+-- Usage:
+-- hive -f clone-xcs.hql -d year=2014 -d month=9 -d day=15 -d xcs=515-05 
-d table=tmp_clone
+--
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:xcs=515-05;
+-- set hivevar:table=tmp_clone;
+
+use yurik;
+
+CREATE TABLE IF NOT EXISTS ${table} (
+  hostname string,
+  sequence bigint,
+  dt string,
+  time_firstbyte float,
+  ip string,
+  cache_status string,
+  http_status string,
+  response_size bigint,
+  http_method string,
+  uri_host string,
+  uri_path string,
+  uri_query string,
+  content_type string,
+  referer string,
+  x_forwarded_for string,
+  user_agent string,
+  accept_language string,
+  x_analytics string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+INSERT OVERWRITE TABLE ${table}
+
+SELECT
+*
+FROM wmf_raw.webrequest
+WHERE
+webrequest_source IN ('text', 'mobile')
+AND year=${year}
+AND month=${month}
+AND day=${day}
+AND x_analytics LIKE '%zero=%'
+AND SUBSTR(uri_path, 1, 6) = '/wiki/'
+AND (
+(
+SUBSTR(ip, 1, 9) != '10.128.0.'
+AND SUBSTR(ip, 1, 11) NOT IN (
+'208.80.152.',
+'208.80.153.',
+'208.80.154.',
+'208.80.155.',
+'91.198.174.'
+)
+) OR x_forwarded_for != '-'
+)
+AND SUBSTR(uri_path, 1, 31) != '/wiki/Special:CentralAutoLogin/'
+AND http_status NOT IN ( '301', '302', '303' )
+AND uri_host RLIKE '^[A-Za-z0-9-]+(\\.(zero|m))?\\.[a-z]*\\.org$'
+AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
+AND regexp_extract(x_analytics, 'zero=([^\;]+)') = ${xcs}
+
diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index e04685f..0df0a65 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -9,5 +9,5 @@
 for ((day = $3; day <= $last; day++)); do
printf -v p %04d-%02d-%02d $1 $2 $day
echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day 
-d date=$p
-   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d 
date=$p
+   export HADOOP_HEAPSIZE=1024  hive -f zero-counts.hql -d year=$1 -d 
month=$2 -d day=$day -d date=$p
 done
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 2432422..0812611 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -2,6 +2,7 @@
 import StringIO
 import re
 import collections
+
 from pandas import read_table, pivot_table
 from pandas.core.frame import DataFrame, Series
 import numpy as np
@@ -36,7 +37,7 @@
 if self.settings.pathCacheLegacy:
 self.pathCacheLegacy = 
self.normalizePath(self.settings.pathCacheLegacy)
 else:
-self.pathCacheLegacy = self.settings.pathCacheLegacy
+self.pathCacheLegacy = False
 
 self.legacyFileRe = 
re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE)
 
@@ -86,6 +87,8 @@
 else:
 raise ValueError('Unrecognized key (%s) in file %s' % 
(joinValues(vals), f))
 (dt, typ, xcs, via, ipset, https, lang, subdomain, site, 
count) = vals
+
+via = via.upper()
 
 error = False
 if xcs == '404-01b':
@@ -151,7 +154,8 @@
 # 0  1  2   34  56 7
 # 250-99 DIRECT default http ru zero wikipedia 1000
 (xcs, via, ipset, https, lang, subdomain, site, count) = 
vals
-via = via if via else u'DIRECT'
+
+via = via.upper() if via else u'DIRECT'
 ipset = ipset if ipset else u'default'
 https = https if https else u'http'
 
@@ -195,9 +199,9 @@
 
 if legacyStats:
 

[MediaWiki-commits] [Gerrit] clone script, overwrite error, mem error - change (analytics/zero-sms)

2014-10-23 Thread Yurik (Code Review)
Yurik has submitted this change and it was merged.

Change subject: clone script, overwrite error, mem error
..


clone script, overwrite error, mem error

Change-Id: Icc87571d1d897969184516c4dc0ea5b9cfdcdf63
---
A scripts/clone-xcs.hql
M scripts/run-hivezero.sh
M scripts/weblogs2.py
M scripts/zero-counts.hql
4 files changed, 88 insertions(+), 10 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/clone-xcs.hql b/scripts/clone-xcs.hql
new file mode 100644
index 0000000..5e7f500
--- /dev/null
+++ b/scripts/clone-xcs.hql
@@ -0,0 +1,69 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=false;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Clone one day worth of data to a temp table
+--
+-- Usage:
+-- hive -f clone-xcs.hql -d year=2014 -d month=9 -d day=15 -d xcs=515-05 
-d table=tmp_clone
+--
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:xcs=515-05;
+-- set hivevar:table=tmp_clone;
+
+use yurik;
+
+CREATE TABLE IF NOT EXISTS ${table} (
+  hostname string,
+  sequence bigint,
+  dt string,
+  time_firstbyte float,
+  ip string,
+  cache_status string,
+  http_status string,
+  response_size bigint,
+  http_method string,
+  uri_host string,
+  uri_path string,
+  uri_query string,
+  content_type string,
+  referer string,
+  x_forwarded_for string,
+  user_agent string,
+  accept_language string,
+  x_analytics string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+INSERT OVERWRITE TABLE ${table}
+
+SELECT
+*
+FROM wmf_raw.webrequest
+WHERE
+webrequest_source IN ('text', 'mobile')
+AND year=${year}
+AND month=${month}
+AND day=${day}
+AND x_analytics LIKE '%zero=%'
+AND SUBSTR(uri_path, 1, 6) = '/wiki/'
+AND (
+(
+SUBSTR(ip, 1, 9) != '10.128.0.'
+AND SUBSTR(ip, 1, 11) NOT IN (
+'208.80.152.',
+'208.80.153.',
+'208.80.154.',
+'208.80.155.',
+'91.198.174.'
+)
+) OR x_forwarded_for != '-'
+)
+AND SUBSTR(uri_path, 1, 31) != '/wiki/Special:CentralAutoLogin/'
+AND http_status NOT IN ( '301', '302', '303' )
+AND uri_host RLIKE '^[A-Za-z0-9-]+(\\.(zero|m))?\\.[a-z]*\\.org$'
+AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
+AND regexp_extract(x_analytics, 'zero=([^\;]+)') = ${xcs}
+
diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index e04685f..0df0a65 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -9,5 +9,5 @@
 for ((day = $3; day <= $last; day++)); do
printf -v p %04d-%02d-%02d $1 $2 $day
echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day 
-d date=$p
-   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$day -d 
date=$p
+   export HADOOP_HEAPSIZE=1024  hive -f zero-counts.hql -d year=$1 -d 
month=$2 -d day=$day -d date=$p
 done
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 2432422..0812611 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -2,6 +2,7 @@
 import StringIO
 import re
 import collections
+
 from pandas import read_table, pivot_table
 from pandas.core.frame import DataFrame, Series
 import numpy as np
@@ -36,7 +37,7 @@
 if self.settings.pathCacheLegacy:
 self.pathCacheLegacy = 
self.normalizePath(self.settings.pathCacheLegacy)
 else:
-self.pathCacheLegacy = self.settings.pathCacheLegacy
+self.pathCacheLegacy = False
 
 self.legacyFileRe = 
re.compile(r'^(zero\.tsv\.log-(\d+)\.gz)__\d+\.tsv$', re.IGNORECASE)
 
@@ -86,6 +87,8 @@
 else:
 raise ValueError('Unrecognized key (%s) in file %s' % 
(joinValues(vals), f))
 (dt, typ, xcs, via, ipset, https, lang, subdomain, site, 
count) = vals
+
+via = via.upper()
 
 error = False
 if xcs == '404-01b':
@@ -151,7 +154,8 @@
 # 0  1  2   34  56 7
 # 250-99 DIRECT default http ru zero wikipedia 1000
 (xcs, via, ipset, https, lang, subdomain, site, count) = 
vals
-via = via if via else u'DIRECT'
+
+via = via.upper() if via else u'DIRECT'
 ipset = ipset if ipset else u'default'
 https = https if https else u'http'
 
@@ -195,9 +199,9 @@
 
 if legacyStats:
 # Only add legacy data for dates that we haven't seen in hadoop