[MediaWiki-commits] [Gerrit] create table if doesn't exist, runner script - change (analytics/zero-sms)

2014-10-22 Thread Yurik (Code Review)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/168168

Change subject: create table if doesn't exist, runner script
..

create table if doesn't exist, runner script

Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
---
A scripts/run-hivezero.sh
M scripts/zero-counts.hql
2 files changed, 64 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms 
refs/changes/68/168168/1

diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
new file mode 100644
index 000..c1a8e96
--- /dev/null
+++ b/scripts/run-hivezero.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+if [[ -z $4 ]]; then
+   last=$3
+else
+   last=$4
+fi
+
+for ((day = $3; day = $last; day++)); do
+   printf -v p %04d-%02d-%02d $1 $2 $day
+   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+done
diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql
index 67a7fb6..2e7d0db 100644
--- a/scripts/zero-counts.hql
+++ b/scripts/zero-counts.hql
@@ -5,25 +5,38 @@
 -- Extracts zero stats from webrequests into a separate table
 --
 -- Usage:
--- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15
+-- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d 
date=2014-09-15
+-- Date is duplicated because I haven't figured an easy way to set 
date=printf()
 --
--- Range usage:
---  cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d 
month=10 -d day=%
--- or
---  for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10 
-d day=$i; done
-
 -- set hivevar:year=2014;
 -- set hivevar:month=10;
 -- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS yurik.zero_webstats (
+  xcs string,
+  via string,
+  ipset string,
+  https string,
+  lang string,
+  subdomain string,
+  site string,
+  count bigint)
+PARTITIONED BY (
+  date string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
 
 
 INSERT OVERWRITE TABLE yurik.zero_webstats
-PARTITION(date) IF NOT EXISTS
+PARTITION(date=${date}) IF NOT EXISTS
 SELECT
-xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date
+xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
 FROM (
 SELECT
-printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
 regexp_extract(x_analytics, 'zero=([^\;]+)') xcs,
 regexp_extract(x_analytics, 'proxy=([^\;]+)') via,
 regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset,
@@ -58,5 +71,32 @@
 AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
 
 ) prepared
-GROUP BY date, xcs, via, ipset, https, lang, subdomain, site
-DISTRIBUTE BY date;
+GROUP BY xcs, via, ipset, https, lang, subdomain, site
+DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});
+
+
+-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum (
+--   date string,
+--   tag string,
+--   count bigint)
+-- PARTITIONED BY (
+--   xcs string)
+-- ROW FORMAT DELIMITED
+--   FIELDS TERMINATED BY '\t';
+
+--   xcs string,
+--   via string,
+--   ipset string,
+--   https string,
+--   lang string,
+--   subdomain string,
+--   site string,
+--   count bigint)
+
+-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum
+-- PARTITION(xcs)
+-- SELECT
+-- date, tag, COUNT(*) count, xcs
+-- FROM yurik.zero_webstats
+-- GROUP BY xcs, date, tag
+-- DISTRIBUTE BY xcs;

-- 
To view, visit https://gerrit.wikimedia.org/r/168168
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik yu...@wikimedia.org

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] create table if doesn't exist, runner script - change (analytics/zero-sms)

2014-10-22 Thread Yurik (Code Review)
Yurik has submitted this change and it was merged.

Change subject: create table if doesn't exist, runner script
..


create table if doesn't exist, runner script

Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
---
A scripts/run-hivezero.sh
M scripts/zero-counts.hql
2 files changed, 64 insertions(+), 11 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
new file mode 100644
index 000..c1a8e96
--- /dev/null
+++ b/scripts/run-hivezero.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+if [[ -z $4 ]]; then
+   last=$3
+else
+   last=$4
+fi
+
+for ((day = $3; day = $last; day++)); do
+   printf -v p %04d-%02d-%02d $1 $2 $day
+   echo hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+   hive -f zero-counts.hql -d year=$1 -d month=$2 -d day=$3 -d 
date=$p
+done
diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql
index 67a7fb6..2e7d0db 100644
--- a/scripts/zero-counts.hql
+++ b/scripts/zero-counts.hql
@@ -5,25 +5,38 @@
 -- Extracts zero stats from webrequests into a separate table
 --
 -- Usage:
--- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15
+-- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d 
date=2014-09-15
+-- Date is duplicated because I haven't figured an easy way to set 
date=printf()
 --
--- Range usage:
---  cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d 
month=10 -d day=%
--- or
---  for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10 
-d day=$i; done
-
 -- set hivevar:year=2014;
 -- set hivevar:month=10;
 -- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS yurik.zero_webstats (
+  xcs string,
+  via string,
+  ipset string,
+  https string,
+  lang string,
+  subdomain string,
+  site string,
+  count bigint)
+PARTITIONED BY (
+  date string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
 
 
 INSERT OVERWRITE TABLE yurik.zero_webstats
-PARTITION(date) IF NOT EXISTS
+PARTITION(date=${date}) IF NOT EXISTS
 SELECT
-xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date
+xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
 FROM (
 SELECT
-printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
 regexp_extract(x_analytics, 'zero=([^\;]+)') xcs,
 regexp_extract(x_analytics, 'proxy=([^\;]+)') via,
 regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset,
@@ -58,5 +71,32 @@
 AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
 
 ) prepared
-GROUP BY date, xcs, via, ipset, https, lang, subdomain, site
-DISTRIBUTE BY date;
+GROUP BY xcs, via, ipset, https, lang, subdomain, site
+DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});
+
+
+-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum (
+--   date string,
+--   tag string,
+--   count bigint)
+-- PARTITIONED BY (
+--   xcs string)
+-- ROW FORMAT DELIMITED
+--   FIELDS TERMINATED BY '\t';
+
+--   xcs string,
+--   via string,
+--   ipset string,
+--   https string,
+--   lang string,
+--   subdomain string,
+--   site string,
+--   count bigint)
+
+-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum
+-- PARTITION(xcs)
+-- SELECT
+-- date, tag, COUNT(*) count, xcs
+-- FROM yurik.zero_webstats
+-- GROUP BY xcs, date, tag
+-- DISTRIBUTE BY xcs;

-- 
To view, visit https://gerrit.wikimedia.org/r/168168
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik yu...@wikimedia.org
Gerrit-Reviewer: Yurik yu...@wikimedia.org

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits