Nuria has submitted this change and it was merged.
Change subject: Correct last_access_uniques oozie jobs
......................................................................
Correct last_access_uniques oozie jobs
Force oozie to let hive choose the number of reducers to
use. By default, it restricts it to 1.
Remove ordering as it forces all data to go through only one
reducer at sort time, and data will be re-sorted anyway at
hive query time.
Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
---
M oozie/last_access_uniques/daily/last_access_uniques_daily.hql
M oozie/last_access_uniques/daily/workflow.xml
M oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
M oozie/last_access_uniques/monthly/workflow.xml
4 files changed, 18 insertions(+), 10 deletions(-)
Approvals:
Nuria: Verified; Looks good to me, approved
diff --git a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
index 454a098..3633e8a 100644
--- a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
+++ b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
@@ -111,8 +111,7 @@
fresh.uniques_offset
-- TODO
-- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
- uniques_estimate DESC
+--
-- Limit enforced by hive strict mapreduce setting.
-- 1000000000 == NO LIMIT !
LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/daily/workflow.xml
b/oozie/last_access_uniques/daily/workflow.xml
index 6022d0f..d875f18 100644
--- a/oozie/last_access_uniques/daily/workflow.xml
+++ b/oozie/last_access_uniques/daily/workflow.xml
@@ -72,6 +72,11 @@
<name>mapreduce.job.queuename</name>
<value>${queue_name}</value>
</property>
+ <!--Let hive decide on the number of reducers -->
+ <property>
+ <name>mapred.reduce.tasks</name>
+ <value>-1</value>
+ </property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive-${user}</value>
diff --git a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
index 94a606b..4dad91d 100644
--- a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
+++ b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
@@ -86,7 +86,7 @@
-- Other cases, don't
ELSE 0
END) AS uniques_underestimate,
- oenca.uniques_offset AS uniques_offset,
+ fresh.uniques_offset AS uniques_offset,
SUM(CASE
-- Last access not set and client accept cookies --> first visit, count
WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
@@ -95,21 +95,20 @@
AND (la.last_access < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
-- Other cases, don't
ELSE 0
- END) + oenca.uniques_offset AS uniques_estimate
+ END) + fresh.uniques_offset AS uniques_estimate
FROM
last_access_dates AS la
- INNER JOIN fresh_sessions_aggregated AS oenca
- ON (oenca.uri_host = la.uri_host
- AND oenca.country_code = la.country_code)
+ INNER JOIN fresh_sessions_aggregated AS fresh
+ ON (fresh.uri_host = la.uri_host
+ AND fresh.country_code = la.country_code)
GROUP BY
la.uri_host,
la.country,
la.country_code,
- oenca.uniques_offset
+ fresh.uniques_offset
-- TODO
-- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
- uniques_estimate DESC
+--
-- Limit enforced by hive strict mapreduce setting.
-- 1000000000 == NO LIMIT !
LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/monthly/workflow.xml
b/oozie/last_access_uniques/monthly/workflow.xml
index 0de7763..af141cc 100644
--- a/oozie/last_access_uniques/monthly/workflow.xml
+++ b/oozie/last_access_uniques/monthly/workflow.xml
@@ -68,6 +68,11 @@
<name>mapreduce.job.queuename</name>
<value>${queue_name}</value>
</property>
+ <!--Let hive decide on the number of reducers -->
+ <property>
+ <name>mapred.reduce.tasks</name>
+ <value>-1</value>
+ </property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive-${user}</value>
--
To view, visit https://gerrit.wikimedia.org/r/271550
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Madhuvishy <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits