Milimetric has uploaded a new change for review. https://gerrit.wikimedia.org/r/246149
Change subject: [WIP] Archive hourly pageviews by article in wsc format ...................................................................... [WIP] Archive hourly pageviews by article in wsc format TODO: test on the cluster Webstatscollector format is used by all other page statistics dumps, so this change outputs pageviews in the new definition with normalized article titles, but in this common format. Bug: T114379 Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3 --- M oozie/pageview/hourly/README.md M oozie/pageview/hourly/coordinator.properties A oozie/pageview/hourly/transform_pageview_to_wsc_format.hql M oozie/pageview/hourly/workflow.xml R oozie/projectview/hourly/transform_projectview_to_wsc_format.hql M oozie/projectview/hourly/workflow.xml 6 files changed, 195 insertions(+), 10 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/49/246149/1 diff --git a/oozie/pageview/hourly/README.md b/oozie/pageview/hourly/README.md index 0b21c9a..f18ce3f 100644 --- a/oozie/pageview/hourly/README.md +++ b/oozie/pageview/hourly/README.md @@ -1,11 +1,14 @@ # Aggregation phase for pageview from webrequest +# and archive into webstatcollector format. -This job is responsible for filtering pageview from -refined webrequest table, then aggregating them into -interesting dimensions. +This job is responsible for filtering pageview data +from the refined webrequest table, aggregating it into +interesting dimensions, and finally transforming and +archiving it into webstatcollector format. 
Output is appended into (year, month, day, hour) partitions -in /wmf/data/wmf/pageview/hourly +in /wmf/data/wmf/pageview/hourly, and then archived into +/wmf/data/archive/pageview/webstatcollector/hourly # Outline diff --git a/oozie/pageview/hourly/coordinator.properties b/oozie/pageview/hourly/coordinator.properties index 805b295..8080e36 100644 --- a/oozie/pageview/hourly/coordinator.properties +++ b/oozie/pageview/hourly/coordinator.properties @@ -1,5 +1,6 @@ # Configures a coordinator to manage automatically aggregating pageview from -# the refined webrequest table. Any of the following properties are overidable with -D. +# the refined webrequest table, transforming to webstatscollector format, and archiving. +# Any of the following properties are overridable with -D. # Usage: # oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config oozie/pageview/hourly/coordinator.properties # @@ -35,8 +36,8 @@ webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest # HDFS path to pageview dataset definitions -pageview_datasets_file = ${oozie_directory}/pageview/datasets.xml -pageview_data_directory = ${name_node}/wmf/data/wmf/pageview +pageview_datasets_file = ${oozie_directory}/pageview/datasets.xml +pageview_data_directory = ${name_node}/wmf/data/wmf/pageview # Initial import time of the webrequest dataset. start_time = 2015-05-01T00:00Z @@ -46,6 +47,8 @@ # HDFS path to workflow to mark a directory as done mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml +# HDFS path to workflow to archive output. +archive_job_output_workflow_file = ${oozie_directory}/util/archive_job_output/workflow.xml # HDFS path to hive-site.xml file. This is needed to run hive actions. 
hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml @@ -57,7 +60,17 @@ # Record version to keep track of changes record_version = 0.0.3 + +# Temporary directory for archiving +temporary_directory = ${name_node}/tmp + +# Archive base directory +archive_directory = ${name_node}/wmf/data/archive + +# Archive directory for pageview_hourly_webstatcollector_format +pageview_archive_directory = ${archive_directory}/pageview/webstatcollector/hourly + # Coordinator to start. -oozie.coord.application.path = ${coordinator_file} +oozie.coord.application.path = ${coordinator_file} oozie.use.system.libpath = true oozie.action.external.stats.write = true diff --git a/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql b/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql new file mode 100644 index 0000000..3838f11 --- /dev/null +++ b/oozie/pageview/hourly/transform_pageview_to_wsc_format.hql @@ -0,0 +1,83 @@ +-- Parameters: +-- source_table -- Fully qualified table name to compute the +-- transformation from. +-- destination_directory -- Directory where to write transformation +-- results +-- year -- year of partition to compute statistics for. +-- month -- month of partition to compute statistics for. +-- day -- day of partition to compute statistics for. +-- hour -- hour of partition to compute statistics for. +-- +-- Usage: +-- hive -f transform_pageview_to_wsc_format.hql \ +-- -d source_table=wmf.pageview_hourly \ +-- -d destination_directory=/tmp/example \ +-- -d year=2015 \ +-- -d month=6 \ +-- -d day=1 \ +-- -d hour=1 +-- + +SET hive.exec.compress.output =false; +SET mapred.reduce.tasks = 1; + + +INSERT OVERWRITE DIRECTORY "${destination_directory}" + -- Since "ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '" only + -- works for exports to local directories (see HIVE-5672), we have to + -- prepare the lines by hand through concatenation :-( + -- Set 0 as volume column since we don't use it. 
+ SELECT + CONCAT_WS(" ", qualifier, page_title, cast(view_count AS string), "0") line + FROM ( + SELECT + CONCAT( + -- Core identifier and mobile/zero + CASE regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$') + WHEN '' THEN ( + --zero/mobile if any, www otherwise + CASE + WHEN COALESCE(zero_carrier, '') <> '' THEN 'zero' + WHEN COALESCE(access_method, '') IN ('mobile web', 'mobile app') THEN 'm' + ELSE 'www' + END + ) + ELSE ( + -- Project ident plus zero/mobile suffix if any + CASE + WHEN COALESCE(zero_carrier, '') <> '' + THEN CONCAT(regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$'), '.zero') + WHEN COALESCE(access_method, '') IN ('mobile web', 'mobile app') + THEN CONCAT(regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$'), '.m') + ELSE regexp_extract(project, '^([A-Za-z0-9-]+)\\.[a-z]*$') + END + ) + END, + -- Project suffix, made NULL if not found + CASE regexp_extract(project, '^([A-Za-z0-9-]+\\.)?(wik(ipedia|ibooks|tionary|imediafoundation|imedia|inews|iquote|isource|iversity|ivoyage|idata)|mediawiki)$', 2) + WHEN 'wikipedia' THEN '' + WHEN 'wikibooks' THEN '.b' + WHEN 'wiktionary' THEN '.d' + WHEN 'wikimediafoundation' THEN '.f' + WHEN 'wikimedia' THEN '.m' + WHEN 'wikinews' THEN '.n' + WHEN 'wikiquote' THEN '.q' + WHEN 'wikisource' THEN '.s' + WHEN 'wikiversity' THEN '.v' + WHEN 'wikivoyage' THEN '.voy' + WHEN 'mediawiki' THEN '.w' + WHEN 'wikidata' THEN '.wd' + ELSE NULL + END + ) qualifier, + view_count + FROM ${source_table} + WHERE year=${year} + AND month=${month} + AND day=${day} + AND hour=${hour} + AND agent_type = 'user' + ) pageview_transformed + ORDER BY line + LIMIT 100000000 +; diff --git a/oozie/pageview/hourly/workflow.xml b/oozie/pageview/hourly/workflow.xml index 8850abf..21eacc7 100644 --- a/oozie/pageview/hourly/workflow.xml +++ b/oozie/pageview/hourly/workflow.xml @@ -3,6 +3,7 @@ name="pageview-hourly-${source_table}->${destination_table}-${year},${month},${day},${hour}-wf"> <parameters> + <!-- Default values for inner oozie 
settings --> <property> <name>oozie_launcher_queue_name</name> @@ -116,6 +117,88 @@ </property> </configuration> </sub-workflow> + <ok to="transform"/> + <error to="kill"/> + </action> + + <action name="transform"> + <hive xmlns="uri:oozie:hive-action:0.2"> + <job-tracker>${job_tracker}</job-tracker> + <name-node>${name_node}</name-node> + <job-xml>${hive_site_xml}</job-xml> + <configuration> + <property> + <name>mapreduce.job.queuename</name> + <value>${queue_name}</value> + </property> + <!--make sure oozie:launcher runs in a low priority queue --> + <property> + <name>oozie.launcher.mapred.job.queue.name</name> + <value>${oozie_launcher_queue_name}</value> + </property> + <property> + <name>oozie.launcher.mapreduce.map.memory.mb</name> + <value>${oozie_launcher_memory}</value> + </property> + <property> + <name>hive.exec.scratchdir</name> + <value>/tmp/hive-${user}</value> + </property> + </configuration> + + <script>${hive_script_transform}</script> + <!-- Here, the source for archive is the + destination of the previous job --> + <param>source_table=${destination_table}</param> + <param>year=${year}</param> + <param>month=${month}</param> + <param>day=${day}</param> + <param>hour=${hour}</param> + <param>destination_directory=${temporary_directory}/${wf:id()}</param> + </hive> + + <ok to="mark_transformed_pageview_dataset_done"/> + <error to="kill"/> + </action> + + <action name="mark_transformed_pageview_dataset_done"> + <sub-workflow> + <app-path>${mark_directory_done_workflow_file}</app-path> + <configuration> + <property> + <name>directory</name> + <value>${temporary_directory}/${wf:id()}</value> + </property> + </configuration> + </sub-workflow> + <ok to="move_data_to_archive"/> + <error to="kill"/> + </action> + + <action name="move_data_to_archive"> + <sub-workflow> + <app-path>${archive_job_output_workflow_file}</app-path> + <propagate-configuration/> + <configuration> + <property> + <name>source_directory</name> + 
<value>${temporary_directory}/${wf:id()}</value> + </property> + <property> + <name>expected_filename_ending</name> + <value>EMPTY</value> + </property> + <property> + <name>archive_file</name> + <!-- + webstatscollector used the end of the collection period as + timestamp in the filename. To not break scripts of people, + we also name files that way. + --> + <value>${pageview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/pageviews-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value> + </property> + </configuration> + </sub-workflow> <ok to="end"/> <error to="kill"/> </action> diff --git a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql b/oozie/projectview/hourly/transform_projectview_to_wsc_format.hql similarity index 98% rename from oozie/projectview/hourly/transform_projectview_to_projectcounts.hql rename to oozie/projectview/hourly/transform_projectview_to_wsc_format.hql index 7cce87d..aa6b47d 100644 --- a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql +++ b/oozie/projectview/hourly/transform_projectview_to_wsc_format.hql @@ -9,7 +9,7 @@ -- hour -- hour of partition to compute statistics for. -- -- Usage: --- hive -f transform_projectview_to_projectcounts.hql \ +-- hive -f transform_projectview_to_wsc_format.hql \ -- -d source_table=wmf.projectview_hourly \ -- -d destination_directory=/tmp/example \ -- -d year=2015 \ @@ -120,4 +120,4 @@ ) projectview_transformed ORDER BY line LIMIT 100000 -; \ No newline at end of file +; diff --git a/oozie/projectview/hourly/workflow.xml b/oozie/projectview/hourly/workflow.xml index 2d8ac0a..e0a3b22 100644 --- a/oozie/projectview/hourly/workflow.xml +++ b/oozie/projectview/hourly/workflow.xml @@ -233,6 +233,9 @@ timestamp in the filename. To not break scripts of people, we also name files that way. 
--> + <!-- TODO: projectcounts should be renamed to projectviews here, + we need to coordinate changing this job with renaming the existing archives + and do this before publishing --> <value>${projectview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/projectcounts-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value> </property> </configuration> -- To view, visit https://gerrit.wikimedia.org/r/246149 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Milimetric <dandree...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits