Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/240099

Change subject: Add pageview quality check to pageview_hourly
......................................................................

Add pageview quality check to pageview_hourly

Add pageview_whitelist and pageview_unexpected_values hive tables
Add whitelist example TSV file in hive expected folder
Update pageview_hourly job to fill unexpected_values table from whitelist check 
and send email if not empty

Bug: T109739
Change-Id: I3bf4f6c42c04843d33df10794f29c54075644156
---
A hive/pageview/hourly/create_pageview_unexpected_values_table.hql
A hive/pageview/hourly/create_pageview_whitelist_table.hql
M oozie/pageview/datasets.xml
M oozie/pageview/hourly/coordinator.properties
M oozie/pageview/hourly/coordinator.xml
A oozie/pageview/hourly/pageview_whitelist_check.hql
M oozie/pageview/hourly/workflow.xml
A static_data/pageview/whitelist/whitelist.tsv
8 files changed, 341 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/99/240099/1

diff --git a/hive/pageview/hourly/create_pageview_unexpected_values_table.hql 
b/hive/pageview/hourly/create_pageview_unexpected_values_table.hql
new file mode 100644
index 0000000..0fc49a6
--- /dev/null
+++ b/hive/pageview/hourly/create_pageview_unexpected_values_table.hql
@@ -0,0 +1,33 @@
+-- Creates table statement for pageview unexpected values table.
+--
+-- NOTE:  When choosing partition field types,
+-- one should take into consideration Hive's
+-- insistence on storing partition values
+-- as strings.  See:
+-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png
+-- and
+-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_pageview_unexpected_table.hql --database wmf
+--
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `pageview_unexpected_values`(
+    `field_name`        string  COMMENT 'Name of the field with unexpected 
value',
+    `unexpected_value`  string  COMMENT 'Value not in the whitel-ist for the 
given field name',
+    `view_count`        bigint  COMMENT 'Number of views of the unexpected 
value'
+)
+PARTITIONED BY (
+    `year`              int     COMMENT 'Unpadded year of request',
+    `month`             int     COMMENT 'Unpadded month of request',
+    `day`               int     COMMENT 'Unpadded day of request',
+    `hour`              int     COMMENT 'Unpadded hour of request'
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+LOCATION '/wmf/data/wmf/pageview/unexpected_values'
+;
\ No newline at end of file
diff --git a/hive/pageview/hourly/create_pageview_whitelist_table.hql 
b/hive/pageview/hourly/create_pageview_whitelist_table.hql
new file mode 100644
index 0000000..2e626fd
--- /dev/null
+++ b/hive/pageview/hourly/create_pageview_whitelist_table.hql
@@ -0,0 +1,21 @@
+-- Creates table statement for pageview whitelist table.
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_pageview_whitelist_table.hql --database wmf
+--
+
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `pageview_whitelist` (
+    `field_name`        string  COMMENT 'Name of the field with a white-listed 
value',
+    `authorized_value`  string  COMMENT 'Value authorized for the given field 
name',
+    `insertion_ts`      timestamp  COMMENT 'Date of insertion into the 
white-list',
+)
+ROW FORMAT DELIMITED
+FIELDS TERMINATED BY '\t'
+STORED AS TEXTFILE
+-- Set table location relative to the current refinery folder
+LOCATION '/wmf/refinery/current/static_data/pageview/whitelist';
\ No newline at end of file
diff --git a/oozie/pageview/datasets.xml b/oozie/pageview/datasets.xml
index 87b16c4..217f13d 100644
--- a/oozie/pageview/datasets.xml
+++ b/oozie/pageview/datasets.xml
@@ -30,4 +30,11 @@
         
<uri-template>${pageview_data_directory}/hourly/year=${YEAR}/month=${"$"}{MONTH 
+ 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
         <done-flag>_SUCCESS</done-flag>
     </dataset>
+    <dataset name="pageview_unexpected_values"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${pageview_data_directory}/unexpected_values/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
 </datasets>
diff --git a/oozie/pageview/hourly/coordinator.properties 
b/oozie/pageview/hourly/coordinator.properties
index 805b295..e6cacec 100644
--- a/oozie/pageview/hourly/coordinator.properties
+++ b/oozie/pageview/hourly/coordinator.properties
@@ -47,12 +47,19 @@
 # HDFS path to workflow to mark a directory as done
 mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
 
+# Workflow to send an error email
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
 # HDFS path to hive-site.xml file.  This is needed to run hive actions.
 hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
 
 # Fully qualified Hive table name.
-source_table                      = wmf.webrequest
-destination_table                 = wmf.pageview_hourly
+webrequest_table                  = wmf.webrequest
+pageview_table                 = wmf.pageview_hourly
+
+# Tables for whitelist check
+whitelist_table                   = wmf.pageview_whitelist
+unexpected_values_table           = wmf.pageview_unexpected_values
 
 # Record version to keep track of changes
 record_version                    = 0.0.3
diff --git a/oozie/pageview/hourly/coordinator.xml 
b/oozie/pageview/hourly/coordinator.xml
index 16619cd..caaa221 100644
--- a/oozie/pageview/hourly/coordinator.xml
+++ b/oozie/pageview/hourly/coordinator.xml
@@ -25,8 +25,11 @@
         <property><name>pageview_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
-        <property><name>source_table</name></property>
-        <property><name>destination_table</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>pageview_table</name></property>
+        <property><name>whitelist_table</name></property>
+        <property><name>unexpected_values_table</name></property>
+        <property><name>send_error_email_workflow_file</name></property>
     </parameters>
 
     <controls>
@@ -81,6 +84,9 @@
         <data-out name="pageview_hourly_output" dataset="pageview_hourly">
             <instance>${coord:current(0)}</instance>
         </data-out>
+        <data-out name="pageview_unexpected_values_output" 
dataset="pageview_unexpected_values">
+            <instance>${coord:current(0)}</instance>
+        </data-out>
     </output-events>
 
     <action>
@@ -104,10 +110,15 @@
                     <value>${coord:formatTime(coord:nominalTime(), 
"H")}</value>
                 </property>
                 <property>
-                    <!-- To mark directory done after success -->
-                    <name>destination_dataset_directory</name>
+                    <!-- To mark directory done after pageview success -->
+                    <name>pageview_dataset_directory</name>
                     <value>${coord:dataOut('pageview_hourly_output')}</value>
                 </property>
+                <property>
+                    <!-- To mark directory done after whitelist check success 
-->
+                    <name>unexpected_values_dataset_directory</name>
+                    
<value>${coord:dataOut('pageview_unexpected_values_output')}</value>
+                </property>
 
             </configuration>
         </workflow>
diff --git a/oozie/pageview/hourly/pageview_whitelist_check.hql 
b/oozie/pageview/hourly/pageview_whitelist_check.hql
new file mode 100644
index 0000000..5c82792
--- /dev/null
+++ b/oozie/pageview/hourly/pageview_whitelist_check.hql
@@ -0,0 +1,119 @@
+-- Parameters:
+--     source_table      -- Fully qualified table name to check against
+--                          the whitelist table
+--     whitelist_table   -- Fully qualified table name for the whitelist to be
+--                          used as filter
+--     destination_table -- Fully qualified table name to fill in the values 
not
+--                          in the whitelist
+--     year              -- year of partition to whitelist-check.
+--     month             -- month of partition to whitelist-check
+--     day               -- day of partition to whitelist-check
+--     hour              -- hour of partition to whitelist-check
+--
+-- Usage:
+--     hive -f pageview_whitelist_check.hql                       \
+--         -d source_table=wmf.pageview_hourly                    \
+--         -d whitelist_table=wmf.pageview_whitelist              \
+--         -d destination_table=wmf.pageview_unexpected_values    \
+--         -d year=2015                                           \
+--         -d month=5                                             \
+--         -d day=1                                               \
+--         -d hour=1
+--
+
+-- Restrict to needed fields for the given hour
+WITH pageview_hour AS (
+    SELECT
+        project,
+        continent,
+        country,
+        country_code,
+        access_method,
+        agent_type,
+        SUM(view_count) as view_count
+    FROM ${source_table}
+    WHERE year= ${year}
+        AND month = ${month}
+        AND day = ${day}
+        AND hour = ${hour}
+    GROUP BY
+        project,
+        continent,
+        country,
+        country_code,
+        access_method,
+        agent_type,
+),
+
+-- Get distinct values with field names using previously aggregated data
+distinct_values AS (
+
+    -- distinct project
+    SELECT
+        'project' AS field_name,
+        project as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY project
+
+    -- distinct continent
+    SELECT
+        'continent' AS field_name,
+        continent as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY continent
+
+    -- distinct country
+    SELECT
+        'country' AS field_name,
+        country as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY country
+
+    -- distinct country_code
+    SELECT
+        'country_code' AS field_name,
+        country_code as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY country_code
+
+    -- distinct access_method
+    SELECT
+        'access_method' AS field_name,
+        access_method as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY access_method
+
+    UNION ALL
+
+    -- distinct agent_type
+    SELECT
+        'agent_type' AS field_name,
+        agent_type as value,
+        SUM(view_count) as view_count
+    FROM pageview_hour
+    GROUP BY agent_type
+
+)
+
+-- Write in destination table partition the data that didn't match the join
+INSERT OVERWRITE TABLE ${destination_table}
+    PARTITION(year=${year},month=${month},day=${day},hour=${hour})
+SELECT
+    -- Since the whitelist is small enough to fit in RAM, strongly improve
+    -- efficiency using a Map-side join
+    /*+ MAPJOIN(${whitelist_table}) */
+    distinct_values.field_name,
+    distinct_values.value,
+    distinct_values.view_count
+FROM distinct_values
+    LEFT JOIN ${whitelist_table} AS whitelist ON (
+        distinct_values.field_name = whitelist.field_name
+        AND distinct_values.value = whitelist.authorized_value)
+WHERE
+    -- Keep only non-matching values
+    whitelist.field_name IS NULL;
diff --git a/oozie/pageview/hourly/workflow.xml 
b/oozie/pageview/hourly/workflow.xml
index 8850abf..a2aacc8 100644
--- a/oozie/pageview/hourly/workflow.xml
+++ b/oozie/pageview/hourly/workflow.xml
@@ -21,10 +21,16 @@
 
         <!-- Aggregation related configuration properties-->
         <property>
-            <name>hive_script</name>
+            <name>hive_pageview_script</name>
             <!-- This is relative to the containing directory of this file. -->
             <value>pageview_hourly.hql</value>
-            <description>Hive script to run.</description>
+            <description>Hive pageview script to run.</description>
+        </property>
+        <property>
+            <name>hive_whitelist_check_script</name>
+            <!-- This is relative to the containing directory of this file. -->
+            <value>pageview_whitelist_check.hql</value>
+            <description>Hive pageview whitelist check script to 
run.</description>
         </property>
 
         <property>
@@ -32,12 +38,20 @@
             <description>hive-site.xml file path in HDFS</description>
         </property>
         <property>
-            <name>source_table</name>
-            <description>Hive table to refine</description>
+            <name>webrequest_table</name>
+            <description>Hive webrequest refined table</description>
         </property>
         <property>
-            <name>destination_table</name>
-            <description>The destinaton table to store refined data 
in.</description>
+            <name>pageview_table</name>
+            <description>Hive pageview table to store hourly 
data.</description>
+        </property>
+        <property>
+            <name>whitelist_table</name>
+            <description>Hive pageview whitelist table for check</description>
+        </property>
+        <property>
+            <name>unexpected_values_table</name>
+            <description>Hive pageview unexpected values from whitelist 
check</description>
         </property>
         <property>
             <name>year</name>
@@ -60,8 +74,12 @@
             <description>Workflow for marking a directory done</description>
         </property>
         <property>
-            <name>destination_dataset_directory</name>
+            <name>pageview_dataset_directory</name>
             <description>Directory to generate the done flag in</description>
+        </property>
+        <property>
+            <name>send_error_email_workflow_file</name>
+            <description>Workflow for sending an email</description>
         </property>
     </parameters>
 
@@ -92,9 +110,9 @@
                 </property>
             </configuration>
 
-            <script>${hive_script}</script>
-            <param>source_table=${source_table}</param>
-            <param>destination_table=${destination_table}</param>
+            <script>${hive_pageview_script}</script>
+            <param>source_table=${webrequest_table}</param>
+            <param>destination_table=${pageview_table}</param>
             <param>record_version=${record_version}</param>
             <param>year=${year}</param>
             <param>month=${month}</param>
@@ -102,24 +120,123 @@
             <param>hour=${hour}</param>
         </hive>
 
-        <ok to="mark_dataset_done"/>
-        <error to="kill"/>
+        <ok to="mark_pageview_dataset_done"/>
+        <error to="send_error_email"/>
     </action>
 
-    <action name="mark_dataset_done">
+    <action name="mark_pageview_dataset_done">
         <sub-workflow>
             <app-path>${mark_directory_done_workflow_file}</app-path>
             <configuration>
                 <property>
                     <name>directory</name>
-                    <value>${destination_dataset_directory}</value>
+                    <value>${pageview_dataset_directory}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="whitelist_check"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="whitelist_check">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+
+            <script>${hive_whitelist_check_script}</script>
+            <param>source_table=${pageview_table}</param>
+            <param>whitelist_table=${whitelist_table}</param>
+            <param>destination_table=${unexpected_values_table}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+            <param>hour=${hour}</param>
+        </hive>
+
+        <ok to="mark_unexpected_values_dataset_done"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="mark_unexpected_values_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${unexpected_values_dataset_directory}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="empty_result_check"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <decision name="empty_result_check">
+        <switch>
+            <case to="send_whitelist_email">
+              ${fs:dirSize(unexpected_values_dataset_directory) eq 0}
+            </case>
+            <default to="end"/>
+        </switch>
+    </decision>
+
+
+    <action name="send_whitelist_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+                <property>
+                    <name>subject</name>
+                    <value>Unexpected values in pageview for workflow - 
${wf:name()}</value>
                 </property>
             </configuration>
         </sub-workflow>
         <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
         <error to="kill"/>
     </action>
 
+
+
     <kill name="kill">
         <message>Action failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
diff --git a/static_data/pageview/whitelist/whitelist.tsv 
b/static_data/pageview/whitelist/whitelist.tsv
new file mode 100644
index 0000000..cc7bed2
--- /dev/null
+++ b/static_data/pageview/whitelist/whitelist.tsv
@@ -0,0 +1,6 @@
+agent_type     user    2015-09-18 15:30:00
+agent_type     spider  2015-09-18 15:30:00
+agent_type     bot     2015-09-18 15:30:00
+access_method  desktop 2015-09-18 15:30:00
+access_method  mobile web      2015-09-18 15:30:00
+access_method  mobile app      2015-09-18 15:30:00
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/240099
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3bf4f6c42c04843d33df10794f29c54075644156
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <j...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to