Yurik has uploaded a new change for review.
https://gerrit.wikimedia.org/r/168057
Change subject: Adapted analytics pageview counter
......................................................................
Adapted analytics pageview counter
Change-Id: I342b1360034c4f11c0c0f9ad2d964b43f890c0c4
---
A scripts/zero-counts.hql
1 file changed, 65 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms
refs/changes/57/168057/1
diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql
new file mode 100644
index 0000000..170cca5
--- /dev/null
+++ b/scripts/zero-counts.hql
@@ -0,0 +1,65 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=true;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Extracts zero stats from webrequests into a separate table
+--
+-- Usage:
+-- hive -f zero-counts.hql \
+-- -d year=2014 \
+-- -d month=9 \
+-- -d day=15 \
+-- -d hour=20
+--
+
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:hour=1;
+-- printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
+;
+
+INSERT OVERWRITE TABLE yurik.zero_webstats
+ PARTITION(date)
+ SELECT
+ xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date
+ FROM (
+ SELECT
+ printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
+ regexp_extract(x_analytics, 'zero=([^\;]+)') xcs,
+ regexp_extract(x_analytics, 'proxy=([^\;]+)') via,
+ regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset,
+ if (x_analytics LIKE '%https=1%', 'https', '') https,
+ regexp_extract(uri_host,
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 1) lang,
+ regexp_extract(uri_host,
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 3) subdomain,
+ regexp_extract(uri_host,
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 4) site
+
+ FROM wmf_raw.webrequest
+ WHERE
+ webrequest_source IN ('text', 'mobile')
+ AND year=${year}
+ AND month=${month}
+ AND day=${day}
+ AND hour=2
+ AND x_analytics LIKE '%zero=%'
+ AND SUBSTR(uri_path, 1, 6) = '/wiki/'
+ AND (
+ (
+ SUBSTR(ip, 1, 9) != '10.128.0.'
+ AND SUBSTR(ip, 1, 11) NOT IN (
+ '208.80.152.',
+ '208.80.153.',
+ '208.80.154.',
+ '208.80.155.',
+ '91.198.174.'
+ )
+ ) OR x_forwarded_for != '-'
+ )
+ AND SUBSTR(uri_path, 1, 31) != '/wiki/Special:CentralAutoLogin/'
+ AND http_status NOT IN ( '301', '302', '303' )
+ AND uri_host RLIKE '^[A-Za-z0-9-]+(\\.(zero|m))?\\.[a-z]*\\.org$'
+ AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0]
RLIKE '^[Uu]ndefined$')
+
+ ) prepared
+ GROUP BY date, xcs, via, ipset, https, lang, subdomain, site
+ DISTRIBUTE BY date;
--
To view, visit https://gerrit.wikimedia.org/r/168057
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I342b1360034c4f11c0c0f9ad2d964b43f890c0c4
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits