Joal has uploaded a new change for review.
https://gerrit.wikimedia.org/r/271550
Change subject: Correct last_access_uniques oozie jobs
......................................................................
Correct last_access_uniques oozie jobs
Force oozie to let hive choose the number of reducers to
use. By default, it is restricted to 1 <WTF???>.
Remove ordering as it forces all data to go through only one
reducer at sort time, and data will be re-sorted anyway at
hive query time.
Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
---
M oozie/last_access_uniques/daily/last_access_uniques_daily.hql
M oozie/last_access_uniques/daily/workflow.xml
M oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
M oozie/last_access_uniques/monthly/workflow.xml
4 files changed, 18 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/50/271550/1
diff --git a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
index 454a098..3633e8a 100644
--- a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
+++ b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
@@ -111,8 +111,7 @@
fresh.uniques_offset
-- TODO
-- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
- uniques_estimate DESC
+--
-- Limit enforced by hive strict mapreduce setting.
-- 1000000000 == NO LIMIT !
LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/daily/workflow.xml
b/oozie/last_access_uniques/daily/workflow.xml
index 6022d0f..d875f18 100644
--- a/oozie/last_access_uniques/daily/workflow.xml
+++ b/oozie/last_access_uniques/daily/workflow.xml
@@ -72,6 +72,11 @@
<name>mapreduce.job.queuename</name>
<value>${queue_name}</value>
</property>
+ <!--Let hive decide on the number of reducers -->
+ <property>
+ <name>mapred.reduce.tasks</name>
+ <value>-1</value>
+ </property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive-${user}</value>
diff --git a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
index 94a606b..4dad91d 100644
--- a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
+++ b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
@@ -86,7 +86,7 @@
-- Other cases, don't
ELSE 0
END) AS uniques_underestimate,
- oenca.uniques_offset AS uniques_offset,
+ fresh.uniques_offset AS uniques_offset,
SUM(CASE
-- Last access not set and client accept cookies --> first visit, count
WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
@@ -95,21 +95,20 @@
AND (la.last_access < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
-- Other cases, don't
ELSE 0
- END) + oenca.uniques_offset AS uniques_estimate
+ END) + fresh.uniques_offset AS uniques_estimate
FROM
last_access_dates AS la
- INNER JOIN fresh_sessions_aggregated AS oenca
- ON (oenca.uri_host = la.uri_host
- AND oenca.country_code = la.country_code)
+ INNER JOIN fresh_sessions_aggregated AS fresh
+ ON (fresh.uri_host = la.uri_host
+ AND fresh.country_code = la.country_code)
GROUP BY
la.uri_host,
la.country,
la.country_code,
- oenca.uniques_offset
+ fresh.uniques_offset
-- TODO
-- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
- uniques_estimate DESC
+--
-- Limit enforced by hive strict mapreduce setting.
-- 1000000000 == NO LIMIT !
LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/monthly/workflow.xml
b/oozie/last_access_uniques/monthly/workflow.xml
index 0de7763..af141cc 100644
--- a/oozie/last_access_uniques/monthly/workflow.xml
+++ b/oozie/last_access_uniques/monthly/workflow.xml
@@ -68,6 +68,11 @@
<name>mapreduce.job.queuename</name>
<value>${queue_name}</value>
</property>
+ <!--Let hive decide on the number of reducers -->
+ <property>
+ <name>mapred.reduce.tasks</name>
+ <value>-1</value>
+ </property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive-${user}</value>
--
To view, visit https://gerrit.wikimedia.org/r/271550
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits