Nuria has submitted this change and it was merged.

Change subject: Correct last_access_uniques oozie jobs
......................................................................


Correct last_access_uniques oozie jobs

Force Oozie to let Hive choose the number of reducers to
use. By default, it restricts it to 1.
Remove ordering, as it forces all data through only one
reducer at sort time, and data will be re-sorted anyway at
Hive query time.

Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
---
M oozie/last_access_uniques/daily/last_access_uniques_daily.hql
M oozie/last_access_uniques/daily/workflow.xml
M oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
M oozie/last_access_uniques/monthly/workflow.xml
4 files changed, 18 insertions(+), 10 deletions(-)

Approvals:
  Nuria: Verified; Looks good to me, approved



diff --git a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql 
b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
index 454a098..3633e8a 100644
--- a/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
+++ b/oozie/last_access_uniques/daily/last_access_uniques_daily.hql
@@ -111,8 +111,7 @@
     fresh.uniques_offset
 -- TODO
 -- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
-    uniques_estimate DESC
+--
 -- Limit enforced by hive strict mapreduce setting.
 -- 1000000000 == NO LIMIT !
 LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/daily/workflow.xml 
b/oozie/last_access_uniques/daily/workflow.xml
index 6022d0f..d875f18 100644
--- a/oozie/last_access_uniques/daily/workflow.xml
+++ b/oozie/last_access_uniques/daily/workflow.xml
@@ -72,6 +72,11 @@
                     <name>mapreduce.job.queuename</name>
                     <value>${queue_name}</value>
                 </property>
+                <!--Let hive decide on the number of reducers -->
+                <property>
+                    <name>mapred.reduce.tasks</name>
+                    <value>-1</value>
+                </property>
                 <property>
                     <name>hive.exec.scratchdir</name>
                     <value>/tmp/hive-${user}</value>
diff --git a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql 
b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
index 94a606b..4dad91d 100644
--- a/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
+++ b/oozie/last_access_uniques/monthly/last_access_uniques_monthly.hql
@@ -86,7 +86,7 @@
         -- Other cases, don't
         ELSE 0
     END) AS uniques_underestimate,
-    oenca.uniques_offset AS uniques_offset,
+    fresh.uniques_offset AS uniques_offset,
     SUM(CASE
         -- Last access not set and client accept cookies --> first visit, count
         WHEN (la.last_access IS NULL AND la.nocookies is NULL) THEN 1
@@ -95,21 +95,20 @@
             AND (la.last_access < unix_timestamp(CONCAT('${year}-', 
LPAD('${month}', 2, '0'), '-01'), 'yyyy-MM-dd'))) THEN 1
         -- Other cases, don't
         ELSE 0
-    END) + oenca.uniques_offset AS uniques_estimate
+    END) + fresh.uniques_offset AS uniques_estimate
 FROM
     last_access_dates AS la
-    INNER JOIN fresh_sessions_aggregated AS oenca
-        ON (oenca.uri_host = la.uri_host
-            AND oenca.country_code = la.country_code)
+    INNER JOIN fresh_sessions_aggregated AS fresh
+        ON (fresh.uri_host = la.uri_host
+            AND fresh.country_code = la.country_code)
 GROUP BY
     la.uri_host,
     la.country,
     la.country_code,
-    oenca.uniques_offset
+    fresh.uniques_offset
 -- TODO
 -- Add HAVING clause to restrict on long tail (maybe ?)
-ORDER BY
-    uniques_estimate DESC
+--
 -- Limit enforced by hive strict mapreduce setting.
 -- 1000000000 == NO LIMIT !
 LIMIT 1000000000;
diff --git a/oozie/last_access_uniques/monthly/workflow.xml 
b/oozie/last_access_uniques/monthly/workflow.xml
index 0de7763..af141cc 100644
--- a/oozie/last_access_uniques/monthly/workflow.xml
+++ b/oozie/last_access_uniques/monthly/workflow.xml
@@ -68,6 +68,11 @@
                     <name>mapreduce.job.queuename</name>
                     <value>${queue_name}</value>
                 </property>
+                <!--Let hive decide on the number of reducers -->
+                <property>
+                    <name>mapred.reduce.tasks</name>
+                    <value>-1</value>
+                </property>
                 <property>
                     <name>hive.exec.scratchdir</name>
                     <value>/tmp/hive-${user}</value>

-- 
To view, visit https://gerrit.wikimedia.org/r/271550
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ife61a7bf02478b1d367af6aa19c7df3c89eb63f9
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Madhuvishy <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to