Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/240099
Change subject: Add pageview quality check to pageview_hourly ...................................................................... Add pageview quality check to pageview_hourly Add pageview_whitelist and pageview_unexpected_values hive tables Add whitelist example TSV file in hive expected folder Update pageview_hourly job to fill unexpected_values table from whitelist check and send email if not empty Bug: T109739 Change-Id: I3bf4f6c42c04843d33df10794f29c54075644156 --- A hive/pageview/hourly/create_pageview_unexpected_values_table.hql A hive/pageview/hourly/create_pageview_whitelist_table.hql M oozie/pageview/datasets.xml M oozie/pageview/hourly/coordinator.properties M oozie/pageview/hourly/coordinator.xml A oozie/pageview/hourly/pageview_whitelist_check.hql M oozie/pageview/hourly/workflow.xml A static_data/pageview/whitelist/whitelist.tsv 8 files changed, 341 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/99/240099/1 diff --git a/hive/pageview/hourly/create_pageview_unexpected_values_table.hql b/hive/pageview/hourly/create_pageview_unexpected_values_table.hql new file mode 100644 index 0000000..0fc49a6 --- /dev/null +++ b/hive/pageview/hourly/create_pageview_unexpected_values_table.hql @@ -0,0 +1,33 @@ +-- Creates table statement for pageview unexpected values table. +-- +-- NOTE: When choosing partition field types, +-- one should take into consideration Hive's +-- insistence on storing partition values +-- as strings. See: +-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png +-- and +-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt +-- +-- Parameters: +-- <none> +-- +-- Usage +-- hive -f create_pageview_unexpected_table.hql --database wmf +-- + +CREATE EXTERNAL TABLE IF NOT EXISTS `pageview_unexpected_values`( + `field_name` string COMMENT 'Name of the field with unexpected value', + `unexpected_value` string COMMENT 'Value not in the whitel-ist for the given field name', + `view_count` bigint COMMENT 'Number of views of the unexpected value' +) +PARTITIONED BY ( + `year` int COMMENT 'Unpadded year of request', + `month` int COMMENT 'Unpadded month of request', + `day` int COMMENT 'Unpadded day of request', + `hour` int COMMENT 'Unpadded hour of request' +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +LOCATION '/wmf/data/wmf/pageview/unexpected_values' +; \ No newline at end of file diff --git a/hive/pageview/hourly/create_pageview_whitelist_table.hql b/hive/pageview/hourly/create_pageview_whitelist_table.hql new file mode 100644 index 0000000..2e626fd --- /dev/null +++ b/hive/pageview/hourly/create_pageview_whitelist_table.hql @@ -0,0 +1,21 @@ +-- Creates table statement for pageview whitelist table. +-- +-- Parameters: +-- <none> +-- +-- Usage +-- hive -f create_pageview_whitelist_table.hql --database wmf +-- + + + +CREATE EXTERNAL TABLE IF NOT EXISTS `pageview_whitelist` ( + `field_name` string COMMENT 'Name of the field with a white-listed value', + `authorized_value` string COMMENT 'Value authorized for the given field name', + `insertion_ts` timestamp COMMENT 'Date of insertion into the white-list', +) +ROW FORMAT DELIMITED +FIELDS TERMINATED BY '\t' +STORED AS TEXTFILE +-- Set table location relative to the current refinery folder +LOCATION '/wmf/refinery/current/static_data/pageview/whitelist'; \ No newline at end of file diff --git a/oozie/pageview/datasets.xml b/oozie/pageview/datasets.xml index 87b16c4..217f13d 100644 --- a/oozie/pageview/datasets.xml +++ b/oozie/pageview/datasets.xml @@ -30,4 +30,11 @@ <uri-template>${pageview_data_directory}/hourly/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> <done-flag>_SUCCESS</done-flag> </dataset> + <dataset name="pageview_unexpected_values" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${pageview_data_directory}/unexpected_values/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> + <done-flag>_SUCCESS</done-flag> + </dataset> </datasets> diff --git a/oozie/pageview/hourly/coordinator.properties b/oozie/pageview/hourly/coordinator.properties index 805b295..e6cacec 100644 --- a/oozie/pageview/hourly/coordinator.properties +++ b/oozie/pageview/hourly/coordinator.properties @@ -47,12 +47,19 @@ # HDFS path to workflow to mark a directory as done mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml +# Workflow to send an error email +send_error_email_workflow_file = ${oozie_directory}/util/send_error_email/workflow.xml + # HDFS path to hive-site.xml file. This is needed to run hive actions. hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml # Fully qualified Hive table name. -source_table = wmf.webrequest -destination_table = wmf.pageview_hourly +webrequest_table = wmf.webrequest +pageview_table = wmf.pageview_hourly + +# Tables for whitelist check +whitelist_table = wmf.pageview_whitelist +unexpected_values_table = wmf.pageview_unexpected_values # Record version to keep track of changes record_version = 0.0.3 diff --git a/oozie/pageview/hourly/coordinator.xml b/oozie/pageview/hourly/coordinator.xml index 16619cd..caaa221 100644 --- a/oozie/pageview/hourly/coordinator.xml +++ b/oozie/pageview/hourly/coordinator.xml @@ -25,8 +25,11 @@ <property><name>pageview_data_directory</name></property> <property><name>hive_site_xml</name></property> - <property><name>source_table</name></property> - <property><name>destination_table</name></property> + <property><name>webrequest_table</name></property> + <property><name>pageview_table</name></property> + <property><name>whitelist_table</name></property> + <property><name>unexpected_values_table</name></property> + <property><name>send_error_email_workflow_file</name></property> </parameters> <controls> @@ -81,6 +84,9 @@ <data-out name="pageview_hourly_output" dataset="pageview_hourly"> <instance>${coord:current(0)}</instance> </data-out> + <data-out name="pageview_unexpected_values_output" dataset="pageview_unexpected_values"> + <instance>${coord:current(0)}</instance> + </data-out> </output-events> <action> @@ -104,10 +110,15 @@ <value>${coord:formatTime(coord:nominalTime(), "H")}</value> </property> <property> - <!-- To mark directory done after success --> - <name>destination_dataset_directory</name> + <!-- To mark directory done after pageview success --> + <name>pageview_dataset_directory</name> <value>${coord:dataOut('pageview_hourly_output')}</value> </property> + <property> + <!-- To mark directory done after whitelist check success --> + <name>unexpected_values_dataset_directory</name> + <value>${coord:dataOut('pageview_unexpected_values_output')}</value> + </property> </configuration> </workflow> diff --git a/oozie/pageview/hourly/pageview_whitelist_check.hql b/oozie/pageview/hourly/pageview_whitelist_check.hql new file mode 100644 index 0000000..5c82792 --- /dev/null +++ b/oozie/pageview/hourly/pageview_whitelist_check.hql @@ -0,0 +1,119 @@ +-- Parameters: +-- source_table -- Fully qualified table name to check against +-- the whitelist table +-- whitelist_table -- Fully qualified table name for the whitelist to be +-- used as filter +-- destination_table -- Fully qualified table name to fill in the values not +-- in the whitelist +-- year -- year of partition to whitelist-check. +-- month -- month of partition to whitelist-check +-- day -- day of partition to whitelist-check +-- hour -- hour of partition to whitelist-check +-- +-- Usage: +-- hive -f pageview_whitelist_check.hql \ +-- -d source_table=wmf.pageview_hourly \ +-- -d whitelist_table=wmf.pageview_whitelist \ +-- -d destination_table=wmf.pageview_unexpected_values \ +-- -d year=2015 \ +-- -d month=5 \ +-- -d day=1 \ +-- -d hour=1 +-- + +-- Restrict to needed fields for the given hour +WITH pageview_hour AS ( + SELECT + project, + continent, + country, + country_code, + access_method, + agent_type, + SUM(view_count) as view_count + FROM ${source_table} + WHERE year= ${year} + AND month = ${month} + AND day = ${day} + AND hour = ${hour} + GROUP BY + project, + continent, + country, + country_code, + access_method, + agent_type, +), + +-- Get distinct values with field names using previously aggregated data +distinct_values AS ( + + -- distinct project + SELECT + 'project' AS field_name, + project as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY project + + -- distinct continent + SELECT + 'continent' AS field_name, + continent as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY continent + + -- distinct country + SELECT + 'country' AS field_name, + country as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY country + + -- distinct country_code + SELECT + 'country_code' AS field_name, + country_code as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY country_code + + -- distinct access_method + SELECT + 'access_method' AS field_name, + access_method as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY access_method + + UNION ALL + + -- distinct agent_type + SELECT + 'agent_type' AS field_name, + agent_type as value, + SUM(view_count) as view_count + FROM pageview_hour + GROUP BY agent_type + +) + +-- Write in destination table partition the data that didn't match the join +INSERT OVERWRITE TABLE ${destination_table} + PARTITION(year=${year},month=${month},day=${day},hour=${hour}) +SELECT + -- Since the whitelist is small enough to fit in RAM, strongly improve + -- efficiency using a Map-side join + /*+ MAPJOIN(${whitelist_table}) */ + distinct_values.field_name, + distinct_values.value, + distinct_values.view_count +FROM distinct_values + LEFT JOIN ${whitelist_table} AS whitelist ON ( + distinct_values.field_name = whitelist.field_name + AND distinct_values.value = whitelist.authorized_value) +WHERE + -- Keep only non-matching values + whitelist.field_name IS NULL; diff --git a/oozie/pageview/hourly/workflow.xml b/oozie/pageview/hourly/workflow.xml index 8850abf..a2aacc8 100644 --- a/oozie/pageview/hourly/workflow.xml +++ b/oozie/pageview/hourly/workflow.xml @@ -21,10 +21,16 @@ <!-- Aggregation related configuration properties--> <property> - <name>hive_script</name> + <name>hive_pageview_script</name> <!-- This is relative to the containing directory of this file. --> <value>pageview_hourly.hql</value> - <description>Hive script to run.</description> + <description>Hive pageview script to run.</description> + </property> + <property> + <name>hive_whitelist_check_script</name> + <!-- This is relative to the containing directory of this file. --> + <value>pageview_whitelist_check.hql</value> + <description>Hive pageview whitelist check script to run.</description> </property> <property> @@ -32,12 +38,20 @@ <description>hive-site.xml file path in HDFS</description> </property> <property> - <name>source_table</name> - <description>Hive table to refine</description> + <name>webrequest_table</name> + <description>Hive webrequest refined table</description> </property> <property> - <name>destination_table</name> - <description>The destinaton table to store refined data in.</description> + <name>pageview_table</name> + <description>Hive pageview table to store hourly data.</description> + </property> + <property> + <name>whitelist_table</name> + <description>Hive pageview whitelist table for check</description> + </property> + <property> + <name>unexpected_values_table</name> + <description>Hive pageview unexpected values from whitelist check</description> </property> <property> <name>year</name> @@ -60,8 +74,12 @@ <description>Workflow for marking a directory done</description> </property> <property> - <name>destination_dataset_directory</name> + <name>pageview_dataset_directory</name> <description>Directory to generate the done flag in</description> + </property> + <property> + <name>send_error_email_workflow_file</name> + <description>Workflow for sending an email</description> </property> </parameters> @@ -92,9 +110,9 @@ </property> </configuration> - <script>${hive_script}</script> - <param>source_table=${source_table}</param> - <param>destination_table=${destination_table}</param> + <script>${hive_pageview_script}</script> + <param>source_table=${webrequest_table}</param> + <param>destination_table=${pageview_table}</param> <param>record_version=${record_version}</param> <param>year=${year}</param> <param>month=${month}</param> @@ -102,24 +120,123 @@ <param>hour=${hour}</param> </hive> - <ok to="mark_dataset_done"/> - <error to="kill"/> + <ok to="mark_pageview_dataset_done"/> + <error to="send_error_email"/> </action> - <action name="mark_dataset_done"> + <action name="mark_pageview_dataset_done"> <sub-workflow> <app-path>${mark_directory_done_workflow_file}</app-path> <configuration> <property> <name>directory</name> - <value>${destination_dataset_directory}</value> + <value>${pageview_dataset_directory}</value> + </property> + </configuration> + </sub-workflow> + <ok to="whitelist_check"/> + <error to="send_error_email"/> + </action> + + <action name="whitelist_check"> + <hive xmlns="uri:oozie:hive-action:0.2"> + <job-tracker>${job_tracker}</job-tracker> + <name-node>${name_node}</name-node> + <job-xml>${hive_site_xml}</job-xml> + <configuration> + <property> + <name>mapreduce.job.queuename</name> + <value>${queue_name}</value> + </property> + <!--make sure oozie:launcher runs in a low priority queue --> + <property> + <name>oozie.launcher.mapred.job.queue.name</name> + <value>${oozie_launcher_queue_name}</value> + </property> + <property> + <name>oozie.launcher.mapreduce.map.memory.mb</name> + <value>${oozie_launcher_memory}</value> + </property> + <property> + <name>hive.exec.scratchdir</name> + <value>/tmp/hive-${user}</value> + </property> + </configuration> + + <script>${hive_whitelist_check_script}</script> + <param>source_table=${pageview_table}</param> + <param>whitelist_table=${whitelist_table}</param> + <param>destination_table=${unexpected_values_table}</param> + <param>year=${year}</param> + <param>month=${month}</param> + <param>day=${day}</param> + <param>hour=${hour}</param> + </hive> + + <ok to="mark_unexpected_values_dataset_done"/> + <error to="send_error_email"/> + </action> + + <action name="mark_unexpected_values_dataset_done"> + <sub-workflow> + <app-path>${mark_directory_done_workflow_file}</app-path> + <configuration> + <property> + <name>directory</name> + <value>${unexpected_values_dataset_directory}</value> + </property> + </configuration> + </sub-workflow> + <ok to="empty_result_check"/> + <error to="send_error_email"/> + </action> + + <decision name="empty_result_check"> + <switch> + <case to="send_whitelist_email"> + ${fs:dirSize(unexpected_values_dataset_directory) eq 0} + </case> + <default to="end"/> + </switch> + </decision> + + + <action name="send_whitelist_email"> + <sub-workflow> + <app-path>${send_error_email_workflow_file}</app-path> + <propagate-configuration/> + <configuration> + <property> + <name>parent_name</name> + <value>${wf:name()}</value> + </property> + <property> + <name>subject</name> + <value>Unexpected values in pageview for workflow - ${wf:name()}</value> </property> </configuration> </sub-workflow> <ok to="end"/> + <error to="send_error_email"/> + </action> + + <action name="send_error_email"> + <sub-workflow> + <app-path>${send_error_email_workflow_file}</app-path> + <propagate-configuration/> + <configuration> + <property> + <name>parent_name</name> + <value>${wf:name()}</value> + </property> + </configuration> + </sub-workflow> + <ok to="kill"/> <error to="kill"/> </action> + + <kill name="kill"> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> </kill> diff --git a/static_data/pageview/whitelist/whitelist.tsv b/static_data/pageview/whitelist/whitelist.tsv new file mode 100644 index 0000000..cc7bed2 --- /dev/null +++ b/static_data/pageview/whitelist/whitelist.tsv @@ -0,0 +1,6 @@ +agent_type user 2015-09-18 15:30:00 +agent_type spider 2015-09-18 15:30:00 +agent_type bot 2015-09-18 15:30:00 +access_method desktop 2015-09-18 15:30:00 +access_method mobile web 2015-09-18 15:30:00 +access_method mobile app 2015-09-18 15:30:00 \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/240099 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3bf4f6c42c04843d33df10794f29c54075644156 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Joal <j...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits