Milimetric has uploaded a new change for review. https://gerrit.wikimedia.org/r/246149
Change subject: [WIP] Archive hourly pageviews by article in wsc format ...................................................................... [WIP] Archive hourly pageviews by article in wsc format TODO: test on the cluster Webstatscollector format is used by all other page statistics dumps, so this change outputs pageviews in the new definition with normalized article titles, but in this common format. Bug: T114379 Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3 --- M oozie/pageview/hourly/README.md M oozie/pageview/hourly/coordinator.properties A oozie/pageview/hourly/transform_pageview_to_wsc_format.hql M oozie/pageview/hourly/workflow.xml R oozie/projectview/hourly/transform_projectview_to_wsc_format.hql M oozie/projectview/hourly/workflow.xml 6 files changed, 195 insertions(+), 10 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/49/246149/1 diff --git a/oozie/pageview/hourly/README.md b/oozie/pageview/hourly/README.md index 0b21c9a..f18ce3f 100644 --- a/oozie/pageview/hourly/README.md +++ b/oozie/pageview/hourly/README.md @@ -1,11 +1,14 @@ # Aggregation phase for pageview from webrequest +# and archive into webstatcollector format. -This job is responsible for filtering pageview from -refined webrequest table, then aggregating them into -interesting dimensions. +This job is responsible for filtering pageview data +from the refined webrequest table, aggregating it into +interesting dimensions, and finally transforming and +archiving it into webstatcollector format. 
Output is appended into (year, month, day, hour) partitions -in /wmf/data/wmf/pageview/hourly +in /wmf/data/wmf/pageview/hourly, and then archived into +/wmf/data/archive/pageview/webstatcollector/hourly # Outline diff --git a/oozie/pageview/hourly/coordinator.properties b/oozie/pageview/hourly/coordinator.properties index 805b295..8080e36 100644 --- a/oozie/pageview/hourly/coordinator.properties +++ b/oozie/pageview/hourly/coordinator.properties @@ -1,5 +1,6 @@ # Configures a coordinator to manage automatically aggregating pageview from -# the refined webrequest table. Any of the following properties are overidable with -D. +# the refined webrequest table, transforming to webstatscollector format, and archiving. +# Any of the following properties are overridable with -D. # Usage: # oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config oozie/pageview/hourly/coordinator.properties # @@ -35,8 +36,8 @@ webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest # HDFS path to pageview dataset definitions -pageview_datasets_file = ${oozie_directory}/pageview/datasets.xml -pageview_data_directory = ${name_node}/wmf/data/wmf/pageview +pageview_datasets_file = ${oozie_directory}/pageview/datasets.xml +pageview_data_directory = ${name_node}/wmf/data/wmf/pageview # Initial import time of the webrequest dataset. start_time = 2015-05-01T00:00Z @@ -46,6 +47,8 @@ # HDFS path to workflow to mark a directory as done mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml +# HDFS path to workflow to archive output. +archive_job_output_workflow_file = ${oozie_directory}/util/archive_job_output/workflow.xml # HDFS path to hive-site.xml file. This is needed to run hive actions. 
hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml @@ -57,7 +60,17 @@ # Record version to keep track of changes record_version = 0.0.3 + +# Temporary directory for archiving +temporary_directory = ${name_node}/tmp + +# Archive base directory +archive_directory = ${name_node}/wmf/data/archive + +# Archive directory for pageview_hourly_webstatcollector_format +pageview_archive_directory = ${archive_directory}/pageview/webstatcollector/hourly + # Coordinator to start. -oozie.coord.application.path = ${coordinator_file} +oozie.coord.application.path = ${coordinator_file} oozie.use.system.libpath = true oozie.action.external.stats.write = true diff --git a/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql b/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql new file mode 100644 index 0000000..3838f11 --- /dev/null +++ b/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql @@ -0,0 +1,83 @@ +-- Parameters: +-- source_table -- Fully qualified table name to compute the +-- transformation from. +-- destination_directory -- Directory where to write transformation +-- results +-- year -- year of partition to compute statistics for. +-- month -- month of partition to compute statistics for. +-- day -- day of partition to compute statistics for. +-- hour -- hour of partition to compute statistics for. +-- +-- Usage: +-- hive -f transform_pageview_to_wsc_format.hql \ +-- -d source_table=wmf.pageview_hourly \ +-- -d destination_directory=/tmp/example \ +-- -d year=2015 \ +-- -d month=6 \ +-- -d day=1 \ +-- -d hour=1 +-- + +SET hive.exec.compress.output =false; +SET mapred.reduce.tasks = 1; + + +INSERT OVERWRITE DIRECTORY "${destination_directory}" + -- Since "ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '" only + -- works for exports to local directories (see HIVE-5672), we have to + -- prepare the lines by hand through concatenation :-( + -- Set 0 as volume column since we don't use it. 
+ SELECT + CONCAT_WS(" ", qualifier, page_title, cast(view_count AS string), "0") line + FROM ( + SELECT + CONCAT( + -- Core identifier and mobile/zero + CASE regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$') + WHEN '' THEN ( + --zero/mobile if any, www otherwise + CASE + WHEN COALESCE(zero_carrier, '') <> '' THEN 'zero' + WHEN COALESCE(access_method, '') IN ('mobile web', 'mobile app') THEN 'm' + ELSE 'www' + END + ) + ELSE ( + -- Project ident plus zero/mobile suffix if any + CASE + WHEN COALESCE(zero_carrier, '') <> '' + THEN CONCAT(regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$'), '.zero') + WHEN COALESCE(access_method, '') IN ('mobile web', 'mobile app') + THEN CONCAT(regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$'), '.m') + ELSE regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$') + END + ) + END, + -- Project suffix, made NULL if not found + CASE regexp_extract(project, '^([A-Za-z0-9-]+\\.)?(wik(ipedia|ibooks|tionary|imediafoundation|imedia|inews|iquote|isource|iversity|ivoyage|idata)|mediawiki)$', 2) + WHEN 'wikipedia' THEN '' + WHEN 'wikibooks' THEN '.b' + WHEN 'wiktionary' THEN '.d' + WHEN 'wikimediafoundation' THEN '.f' + WHEN 'wikimedia' THEN '.m' + WHEN 'wikinews' THEN '.n' + WHEN 'wikiquote' THEN '.q' + WHEN 'wikisource' THEN '.s' + WHEN 'wikiversity' THEN '.v' + WHEN 'wikivoyage' THEN '.voy' + WHEN 'mediawiki' THEN '.w' + WHEN 'wikidata' THEN '.wd' + ELSE NULL + END + ) qualifier, + view_count + FROM ${source_table} + WHERE year=${year} + AND month=${month} + AND day=${day} + AND hour=${hour} + AND agent_type = 'user' + ) pageview_transformed + ORDER BY line + LIMIT 100000000 +; diff --git a/oozie/pageview/hourly/workflow.xml b/oozie/pageview/hourly/workflow.xml index 8850abf..21eacc7 100644 --- a/oozie/pageview/hourly/workflow.xml +++ b/oozie/pageview/hourly/workflow.xml @@ -3,6 +3,7 @@ name="pageview-hourly-${source_table}->${destination_table}-${year},${month},${day},${hour}-wf"> <parameters> + <!-- Default values for inner oozie 
settings --> <property> <name>oozie_launcher_queue_name</name> @@ -116,6 +117,88 @@ </property> </configuration> </sub-workflow> + <ok to="transform"/> + <error to="kill"/> + </action> + + <action name="transform"> + <hive xmlns="uri:oozie:hive-action:0.2"> + <job-tracker>${job_tracker}</job-tracker> + <name-node>${name_node}</name-node> + <job-xml>${hive_site_xml}</job-xml> + <configuration> + <property> + <name>mapreduce.job.queuename</name> + <value>${queue_name}</value> + </property> + <!--make sure oozie:launcher runs in a low priority queue --> + <property> + <name>oozie.launcher.mapred.job.queue.name</name> + <value>${oozie_launcher_queue_name}</value> + </property> + <property> + <name>oozie.launcher.mapreduce.map.memory.mb</name> + <value>${oozie_launcher_memory}</value> + </property> + <property> + <name>hive.exec.scratchdir</name> + <value>/tmp/hive-${user}</value> + </property> + </configuration> + + <script>${hive_script_transform}</script> + <!-- Here, the source for archive is the + destination of the previous job --> + <param>source_table=${destination_table}</param> + <param>year=${year}</param> + <param>month=${month}</param> + <param>day=${day}</param> + <param>hour=${hour}</param> + <param>destination_directory=${temporary_directory}/${wf:id()}</param> + </hive> + + <ok to="mark_transformed_pageview_dataset_done"/> + <error to="kill"/> + </action> + + <action name="mark_transformed_pageview_dataset_done"> + <sub-workflow> + <app-path>${mark_directory_done_workflow_file}</app-path> + <configuration> + <property> + <name>directory</name> + <value>${temporary_directory}/${wf:id()}</value> + </property> + </configuration> + </sub-workflow> + <ok to="move_data_to_archive"/> + <error to="kill"/> + </action> + + <action name="move_data_to_archive"> + <sub-workflow> + <app-path>${archive_job_output_workflow_file}</app-path> + <propagate-configuration/> + <configuration> + <property> + <name>source_directory</name> + 
<value>${temporary_directory}/${wf:id()}</value> + </property> + <property> + <name>expected_filename_ending</name> + <value>EMPTY</value> + </property> + <property> + <name>archive_file</name> + <!-- + webstatscollector used the end of the collection period as + timestamp in the filename. To not break scripts of people, + we also name files that way. + --> + <value>${pageview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/pageviews-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value> + </property> + </configuration> + </sub-workflow> <ok to="end"/> <error to="kill"/> </action> diff --git a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql b/oozie/projectview/hourly/transform_projectview_to_wsc_format.hql similarity index 98% rename from oozie/projectview/hourly/transform_projectview_to_projectcounts.hql rename to oozie/projectview/hourly/transform_projectview_to_wsc_format.hql index 7cce87d..aa6b47d 100644 --- a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql +++ b/oozie/projectview/hourly/transform_projectview_to_wsc_format.hql @@ -9,7 +9,7 @@ -- hour -- hour of partition to compute statistics for. -- -- Usage: --- hive -f transform_projectview_to_projectcounts.hql \ +-- hive -f transform_projectview_to_wsc_format.hql \ -- -d source_table=wmf.projectview_hourly \ -- -d destination_directory=/tmp/example \ -- -d year=2015 \ @@ -120,4 +120,4 @@ ) projectview_transformed ORDER BY line LIMIT 100000 -; \ No newline at end of file +; diff --git a/oozie/projectview/hourly/workflow.xml b/oozie/projectview/hourly/workflow.xml index 2d8ac0a..e0a3b22 100644 --- a/oozie/projectview/hourly/workflow.xml +++ b/oozie/projectview/hourly/workflow.xml @@ -233,6 +233,9 @@ timestamp in the filename. To not break scripts of people, we also name files that way. 
--> + <!-- TODO: projectcounts should be renamed to projectviews here, + we need to coordinate changing this job with renaming the existing archives + and do this before publishing --> <value>${projectview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/projectcounts-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value> </property> </configuration> -- To view, visit https://gerrit.wikimedia.org/r/246149 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Milimetric <dandree...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits