[MediaWiki-commits] [Gerrit] Updated weblogs2 to create single pages - change (analytics/zero-sms)
Yurik has uploaded a new change for review. https://gerrit.wikimedia.org/r/172661

Change subject: Updated weblogs2 to create single pages
......................................................................

Updated weblogs2 to create single pages

Change-Id: Ic2789a8478b01ee3117385f5e44eaf383c0db9f3
---
M scripts/run-hivezero.sh
M scripts/weblogs2.py
2 files changed, 48 insertions(+), 80 deletions(-)

  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms refs/changes/61/172661/1

diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index 2c8f95c..7688adc 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -31,7 +31,7 @@
 if [ "$( date -d $date +%F 2>&1 | grep invalid )" = "" ] ; then
     if [[ $table == 'wmf_raw.webrequest' ]]; then
-        path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$month/$day/23
+        path=/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$(printf %02d $month)/$(printf %02d $day)/23
     else
         path=/mnt/hdfs/user/hive/warehouse/yurik.db/$table/year=$year/month=$month/day=$day
     fi
@@ -45,12 +45,14 @@
         continue
     fi
-    if [[ $6 -eq overwrite ]]; then
-        hive -e "use yurik; ALTER TABLE zero_webstats DROP IF EXISTS PARTITION(date = '$date');"
-    else
-        path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
-        echo "* Checking if '$path' exists"
-        if [ -d $path ]; then
+    path=/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date=$date
+    echo "* Checking if '$path' exists"
+    if [ -d $path ]; then
+        if [ $6 == overwrite ]; then
+            echo "* Dropping partition '$date'"
+            hive -e "use yurik; ALTER TABLE zero_webstats DROP IF EXISTS PARTITION(date = '$date');"
+        else
+            echo "* Skipping '$date'"
             continue
         fi
     fi

diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index 1e79027..1e0f727 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -4,7 +4,7 @@
 import collections
 from pandas import read_table, pivot_table
-from pandas.core.frame import DataFrame, Series
+from pandas.core.frame import DataFrame
 import numpy as np
 from logprocessor import *
@@ -58,13 +58,13 @@
         ignoreViaBefore = datetime(2014, 3, 22)
         configs = self.downloadConfigs()
         stats = collections.defaultdict(int)
-        for dateDir in os.listdir(self.pathCache):
+        for dateDir in os.listdir(self.pathLogs):
             m = self.dateDirRe.match(dateDir)
             if not m:
                 continue
             dateStr = m.group(1)
             dt = datetime.strptime(dateStr, '%Y-%m-%d')
-            datePath = os.path.join(self.pathCache, dateDir)
+            datePath = os.path.join(self.pathLogs, dateDir)
             for f in os.listdir(datePath):
                 if not self.fileRe.match(f):
                     continue
@@ -136,90 +136,56 @@
         xcsList = [xcs for xcs in allData.xcs.unique() if xcs != 'ERROR' and xcs[0:4] != 'TEST']

         # filter type==DATA and site==wikipedia
-        allData = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]
-        # filter out last date
-        lastDate = allData.date.max()
-        df = allData[allData.date < lastDate]
+        df = allData[(allData['xcs'].isin(xcsList)) & (allData['site'] == 'wikipedia')]

-        allowedSubdomains = ['m', 'zero']
-        limnCompat = df[(df.ison == 'y') & (df.iszero == 'y') & (df.subdomain.isin(allowedSubdomains))]
         s = StringIO.StringIO()
-        pivot_table(limnCompat, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum).to_csv(s, header=True)
-        result = s.getvalue()
+        allowedSubdomains = ['m', 'zero']
+        dailySubdomains = df[(df.ison == 'y') & (df.iszero == 'y') & (df.subdomain.isin(allowedSubdomains))]
+        pivot_table(dailySubdomains, 'count', ['date', 'xcs', 'subdomain'], aggfunc=np.sum).to_csv(s, header=False)

         wiki(
             'edit',
             title='RawData:DailySubdomains',
             summary='refreshing data',
-            text=result,
+            text='date,xcs,subdomain,count\n' + s.getvalue(),
             token=wiki.token()
         )

-        # allEnabled = df[(df.ison == 'y') & (df.iszero == 'y')]
-        # s = StringIO.StringIO()
-        # pivot_table(allEnabled, 'count', ['date', 'xcs'], aggfunc=np.sum).to_csv(s, header=True)
-        # result = s.getvalue()
-        #
-        # wiki(
-        #     'edit',
-        #     title='RawData:AllEnabled',
-        #     summary='refreshing data',
-        #     text=result,
-        #     token=wiki.token()
-        # )
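
The run-hivezero.sh hunk fixes path construction: the hourly raw-webrequest partitions evidently use two-digit month and day directory names, so the unpadded $month/$day values could point at directories that do not exist. A minimal Python sketch of the same zero-padding; the base path comes from the diff, while the sample date values are invented:

    # Two-digit month/day components, mirroring the printf %02d fix above.
    year, month, day = 2014, 11, 5  # invented sample values
    base = '/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly'

    unpadded = '%s/%d/%d/%d/23' % (base, year, month, day)
    padded = '%s/%d/%02d/%02d/23' % (base, year, month, day)
    print(unpadded)  # .../hourly/2014/11/5/23  -- misses a zero-padded directory
    print(padded)    # .../hourly/2014/11/05/23 -- matches the assumed HDFS layout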
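The weblogs2.py hunk swaps pandas' generated CSV header (header=True) for a fixed 'date,xcs,subdomain,count' first line in the uploaded page text. A standalone sketch of that pivot-and-serialize step, using Python 3's io.StringIO in place of the script's Python 2 StringIO; the toy rows are invented, and only the column names come from the diff:

    import io
    import numpy as np
    import pandas as pd

    # Toy stand-in for the aggregated log data.
    df = pd.DataFrame({
        'date':      ['2014-11-01', '2014-11-01', '2014-11-02'],
        'xcs':       ['250-99', '250-99', '250-99'],
        'subdomain': ['m', 'zero', 'm'],
        'ison':      ['y', 'y', 'y'],
        'iszero':    ['y', 'y', 'y'],
        'count':     [10, 5, 7],
    })

    allowedSubdomains = ['m', 'zero']
    # Boolean masks are combined with '&', as in the patch.
    dailySubdomains = df[(df.ison == 'y') & (df.iszero == 'y') &
                         (df.subdomain.isin(allowedSubdomains))]

    s = io.StringIO()
    pd.pivot_table(dailySubdomains, 'count', ['date', 'xcs', 'subdomain'],
                   aggfunc=np.sum).to_csv(s, header=False)
    # header=False plus an explicit first line yields a stable header row,
    # matching the text= argument the script passes to the wiki edit call.
    text = 'date,xcs,subdomain,count\n' + s.getvalue()
    print(text)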
[MediaWiki-commits] [Gerrit] Updated weblogs2 to create single pages - change (analytics/zero-sms)
Yurik has submitted this change and it was merged.

Change subject: Updated weblogs2 to create single pages
......................................................................

Updated weblogs2 to create single pages

Change-Id: Ic2789a8478b01ee3117385f5e44eaf383c0db9f3
---
M scripts/run-hivezero.sh
M scripts/weblogs2.py
2 files changed, 48 insertions(+), 80 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved