[MediaWiki-commits] [Gerrit] Updated weblogs2 to create single pages - change (analytics/zero-sms)

2014-11-11 Thread Yurik (Code Review)
Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/172661

Change subject: Updated weblogs2 to create single pages
..

Updated weblogs2 to create single pages

Change-Id: Ic2789a8478b01ee3117385f5e44eaf383c0db9f3
---
M scripts/run-hivezero.sh
M scripts/weblogs2.py
2 files changed, 48 insertions(+), 80 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms 
refs/changes/61/172661/1

diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index 2c8f95c..7688adc 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -31,7 +31,7 @@
if [ "$( date -d $date +%F 2>&1 | grep invalid )" = "" ] ; then
 
if [[ $table == 'wmf_raw.webrequest' ]]; then
-   
path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$month/$day/23
+   
path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$(printf 
%02d $month)/$(printf %02d $day)/23
else

path=/mnt/hdfs/user/hive/warehouse/yurik.db/$table/year=$year/month=$month/day=$day
fi
@@ -45,12 +45,14 @@
continue
fi
 
-   if [[ $6 -eq overwrite ]]; then
-   hive -e "use yurik; ALTER TABLE zero_webstats DROP IF 
EXISTS PARTITION(date = '$date');"
-   else
-   
path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
-   echo * Checking if '$path' exists
-   if [ -d $path ]; then
+   
path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
+   echo * Checking if '$path' exists
+   if [ -d $path ]; then
+   if [ $6 == overwrite ]; then
+   echo * Droping partition '$date'
+   hive -e "use yurik; ALTER TABLE zero_webstats 
DROP IF EXISTS PARTITION(date = '$date');"
+   else
+   echo * Skipping '$date'
continue
fi
fi
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 1e79027..1e0f727 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -4,7 +4,7 @@
 import collections
 
 from pandas import read_table, pivot_table
-from pandas.core.frame import DataFrame, Series
+from pandas.core.frame import DataFrame
 import numpy as np
 
 from logprocessor import *
@@ -58,13 +58,13 @@
 ignoreViaBefore = datetime(2014, 3, 22)
 configs = self.downloadConfigs()
 stats = collections.defaultdict(int)
-for dateDir in os.listdir(self.pathCache):
+for dateDir in os.listdir(self.pathLogs):
 m = self.dateDirRe.match(dateDir)
 if not m:
 continue
 dateStr = m.group(1)
 dt = datetime.strptime(dateStr, '%Y-%m-%d')
-datePath = os.path.join(self.pathCache, dateDir)
+datePath = os.path.join(self.pathLogs, dateDir)
 for f in os.listdir(datePath):
 if not self.fileRe.match(f):
 continue
@@ -136,90 +136,56 @@
 xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and 
xcs[0:4] != 'TEST']
 
 # filter type==DATA and site==wikipedia
-allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 
'wikipedia')]
-# filter out last date
-lastDate = allData.date.max()
-df = allData[allData.date < lastDate]
+df = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 
'wikipedia')]
 
-allowedSubdomains = ['m', 'zero']
-limnCompat = df[(df.ison == 'y') & (df.iszero == 'y') & 
(df.subdomain.isin(allowedSubdomains))]
 s = StringIO.StringIO()
-pivot_table(limnCompat, 'count', ['date', 'xcs', 'subdomain'], 
aggfunc=np.sum).to_csv(s, header=True)
-result = s.getvalue()
+allowedSubdomains = ['m', 'zero']
+dailySubdomains = df[(df.ison == 'y') & (df.iszero == 'y') & 
(df.subdomain.isin(allowedSubdomains))]
+pivot_table(dailySubdomains, 'count', ['date', 'xcs', 'subdomain'], 
aggfunc=np.sum).to_csv(s, header=False)
 
 wiki(
 'edit',
 title='RawData:DailySubdomains',
 summary='refreshing data',
-text=result,
+text='date,xcs,subdomain,count\n' + s.getvalue(),
 token=wiki.token()
 )
-# allEnabled = df[(df.ison == 'y') & (df.iszero == 'y')]
-# s = StringIO.StringIO()
-# pivot_table(allEnabled, 'count', ['date', 'xcs'], 
aggfunc=np.sum).to_csv(s, header=True)
-# result = s.getvalue()
-#
-# wiki(
-# 'edit',
-# title='RawData:AllEnabled',
-# summary='refreshing data',

[MediaWiki-commits] [Gerrit] Updated weblogs2 to create single pages - change (analytics/zero-sms)

2014-11-11 Thread Yurik (Code Review)
Yurik has submitted this change and it was merged.

Change subject: Updated weblogs2 to create single pages
..


Updated weblogs2 to create single pages

Change-Id: Ic2789a8478b01ee3117385f5e44eaf383c0db9f3
---
M scripts/run-hivezero.sh
M scripts/weblogs2.py
2 files changed, 48 insertions(+), 80 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index 2c8f95c..7688adc 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -31,7 +31,7 @@
if [ "$( date -d $date +%F 2>&1 | grep invalid )" = "" ] ; then
 
if [[ $table == 'wmf_raw.webrequest' ]]; then
-   
path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$month/$day/23
+   
path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$(printf 
%02d $month)/$(printf %02d $day)/23
else

path=/mnt/hdfs/user/hive/warehouse/yurik.db/$table/year=$year/month=$month/day=$day
fi
@@ -45,12 +45,14 @@
continue
fi
 
-   if [[ $6 -eq overwrite ]]; then
-   hive -e "use yurik; ALTER TABLE zero_webstats DROP IF 
EXISTS PARTITION(date = '$date');"
-   else
-   
path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
-   echo * Checking if '$path' exists
-   if [ -d $path ]; then
+   
path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
+   echo * Checking if '$path' exists
+   if [ -d $path ]; then
+   if [ $6 == overwrite ]; then
+   echo * Droping partition '$date'
+   hive -e "use yurik; ALTER TABLE zero_webstats 
DROP IF EXISTS PARTITION(date = '$date');"
+   else
+   echo * Skipping '$date'
continue
fi
fi
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 1e79027..1e0f727 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -4,7 +4,7 @@
 import collections
 
 from pandas import read_table, pivot_table
-from pandas.core.frame import DataFrame, Series
+from pandas.core.frame import DataFrame
 import numpy as np
 
 from logprocessor import *
@@ -58,13 +58,13 @@
 ignoreViaBefore = datetime(2014, 3, 22)
 configs = self.downloadConfigs()
 stats = collections.defaultdict(int)
-for dateDir in os.listdir(self.pathCache):
+for dateDir in os.listdir(self.pathLogs):
 m = self.dateDirRe.match(dateDir)
 if not m:
 continue
 dateStr = m.group(1)
 dt = datetime.strptime(dateStr, '%Y-%m-%d')
-datePath = os.path.join(self.pathCache, dateDir)
+datePath = os.path.join(self.pathLogs, dateDir)
 for f in os.listdir(datePath):
 if not self.fileRe.match(f):
 continue
@@ -136,90 +136,56 @@
 xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and 
xcs[0:4] != 'TEST']
 
 # filter type==DATA and site==wikipedia
-allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 
'wikipedia')]
-# filter out last date
-lastDate = allData.date.max()
-df = allData[allData.date < lastDate]
+df = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 
'wikipedia')]
 
-allowedSubdomains = ['m', 'zero']
-limnCompat = df[(df.ison == 'y') & (df.iszero == 'y') & 
(df.subdomain.isin(allowedSubdomains))]
 s = StringIO.StringIO()
-pivot_table(limnCompat, 'count', ['date', 'xcs', 'subdomain'], 
aggfunc=np.sum).to_csv(s, header=True)
-result = s.getvalue()
+allowedSubdomains = ['m', 'zero']
+dailySubdomains = df[(df.ison == 'y') & (df.iszero == 'y') & 
(df.subdomain.isin(allowedSubdomains))]
+pivot_table(dailySubdomains, 'count', ['date', 'xcs', 'subdomain'], 
aggfunc=np.sum).to_csv(s, header=False)
 
 wiki(
 'edit',
 title='RawData:DailySubdomains',
 summary='refreshing data',
-text=result,
+text='date,xcs,subdomain,count\n' + s.getvalue(),
 token=wiki.token()
 )
-# allEnabled = df[(df.ison == 'y') & (df.iszero == 'y')]
-# s = StringIO.StringIO()
-# pivot_table(allEnabled, 'count', ['date', 'xcs'], 
aggfunc=np.sum).to_csv(s, header=True)
-# result = s.getvalue()
-#
-# wiki(
-# 'edit',
-# title='RawData:AllEnabled',
-# summary='refreshing data',
-# text=result,
-# token=wiki.token()
-