[MediaWiki-commits] [Gerrit] create table if doesn't exist, runner script - change (analytics/zero-sms)
Yurik has uploaded a new change for review. https://gerrit.wikimedia.org/r/168168 Change subject: create table if doesn't exist, runner script .. create table if doesn't exist, runner script Change-Id: I29f04352801dabc312d08664e9521ab115d5e498 --- A scripts/run-hivezero.sh M scripts/zero-counts.hql 2 files changed, 64 insertions(+), 11 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms refs/changes/68/168168/1 diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh new file mode 100644 index 000..c1a8e96 --- /dev/null +++ b/scripts/run-hivezero.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [[ -z $4 ]]; then + last=$3 +else + last=$4 +fi + +for ((day = $3; day = $last; day++)); do + printf -v p %04d-%02d-%02d $1 $2 $day + echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p + hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p +done diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql index 67a7fb6..2e7d0db 100644 --- a/scripts/zero-counts.hql +++ b/scripts/zero-counts.hql @@ -5,25 +5,38 @@ -- Extracts zero stats from webrequests into a separate table -- -- Usage: --- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 +-- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d date=2014-09-15 +-- Date is duplicated because I haven't figured an easy way to set date=printf() -- --- Range usage: --- cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d month=10 -d day=% --- or --- for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10 -d day=$i; done - -- set hivevar:year=2014; -- set hivevar:month=10; -- set hivevar:day=21; +-- set hivevar:date=2014-10-21; + +use yurik; + + +CREATE TABLE IF NOT EXISTS yurik.zero_webstats ( + xcs string, + via string, + ipset string, + https string, + lang string, + subdomain string, + site string, + count bigint) +PARTITIONED BY ( + date string) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t'; INSERT OVERWRITE TABLE yurik.zero_webstats -PARTITION(date) IF NOT EXISTS +PARTITION(date=${date}) IF NOT EXISTS SELECT -xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date +xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count FROM ( SELECT -printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date, regexp_extract(x_analytics, 'zero=([^\;]+)') xcs, regexp_extract(x_analytics, 'proxy=([^\;]+)') via, regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset, @@ -58,5 +71,32 @@ AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] RLIKE '^[Uu]ndefined$') ) prepared -GROUP BY date, xcs, via, ipset, https, lang, subdomain, site -DISTRIBUTE BY date; +GROUP BY xcs, via, ipset, https, lang, subdomain, site +DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day}); + + +-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum ( +-- date string, +-- tag string, +-- count bigint) +-- PARTITIONED BY ( +-- xcs string) +-- ROW FORMAT DELIMITED +-- FIELDS TERMINATED BY '\t'; + +-- xcs string, +-- via string, +-- ipset string, +-- https string, +-- lang string, +-- subdomain string, +-- site string, +-- count bigint) + +-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum +-- PARTITION(xcs) +-- SELECT +-- date, tag, COUNT(*) count, xcs +-- FROM yurik.zero_webstats +-- GROUP BY xcs, date, tag +-- DISTRIBUTE BY xcs; -- To view, visit https://gerrit.wikimedia.org/r/168168 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498 Gerrit-PatchSet: 1 Gerrit-Project: analytics/zero-sms Gerrit-Branch: master Gerrit-Owner: Yurik yu...@wikimedia.org ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
[MediaWiki-commits] [Gerrit] create table if doesn't exist, runner script - change (analytics/zero-sms)
Yurik has submitted this change and it was merged. Change subject: create table if doesn't exist, runner script .. create table if doesn't exist, runner script Change-Id: I29f04352801dabc312d08664e9521ab115d5e498 --- A scripts/run-hivezero.sh M scripts/zero-counts.hql 2 files changed, 64 insertions(+), 11 deletions(-) Approvals: Yurik: Verified; Looks good to me, approved diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh new file mode 100644 index 000..c1a8e96 --- /dev/null +++ b/scripts/run-hivezero.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +if [[ -z $4 ]]; then + last=$3 +else + last=$4 +fi + +for ((day = $3; day = $last; day++)); do + printf -v p %04d-%02d-%02d $1 $2 $day + echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p + hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d date=$p +done diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql index 67a7fb6..2e7d0db 100644 --- a/scripts/zero-counts.hql +++ b/scripts/zero-counts.hql @@ -5,25 +5,38 @@ -- Extracts zero stats from webrequests into a separate table -- -- Usage: --- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 +-- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d date=2014-09-15 +-- Date is duplicated because I haven't figured an easy way to set date=printf() -- --- Range usage: --- cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d month=10 -d day=% --- or --- for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10 -d day=$i; done - -- set hivevar:year=2014; -- set hivevar:month=10; -- set hivevar:day=21; +-- set hivevar:date=2014-10-21; + +use yurik; + + +CREATE TABLE IF NOT EXISTS yurik.zero_webstats ( + xcs string, + via string, + ipset string, + https string, + lang string, + subdomain string, + site string, + count bigint) +PARTITIONED BY ( + date string) +ROW FORMAT DELIMITED + FIELDS TERMINATED BY '\t'; INSERT OVERWRITE TABLE yurik.zero_webstats -PARTITION(date) IF NOT EXISTS +PARTITION(date=${date}) IF NOT EXISTS SELECT -xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date +xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count FROM ( SELECT -printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date, regexp_extract(x_analytics, 'zero=([^\;]+)') xcs, regexp_extract(x_analytics, 'proxy=([^\;]+)') via, regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset, @@ -58,5 +71,32 @@ AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] RLIKE '^[Uu]ndefined$') ) prepared -GROUP BY date, xcs, via, ipset, https, lang, subdomain, site -DISTRIBUTE BY date; +GROUP BY xcs, via, ipset, https, lang, subdomain, site +DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day}); + + +-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum ( +-- date string, +-- tag string, +-- count bigint) +-- PARTITIONED BY ( +-- xcs string) +-- ROW FORMAT DELIMITED +-- FIELDS TERMINATED BY '\t'; + +-- xcs string, +-- via string, +-- ipset string, +-- https string, +-- lang string, +-- subdomain string, +-- site string, +-- count bigint) + +-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum +-- PARTITION(xcs) +-- SELECT +-- date, tag, COUNT(*) count, xcs +-- FROM yurik.zero_webstats +-- GROUP BY xcs, date, tag +-- DISTRIBUTE BY xcs; -- To view, visit https://gerrit.wikimedia.org/r/168168 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498 Gerrit-PatchSet: 1 Gerrit-Project: analytics/zero-sms Gerrit-Branch: master Gerrit-Owner: Yurik yu...@wikimedia.org Gerrit-Reviewer: Yurik yu...@wikimedia.org ___ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits