Joal has submitted this change and it was merged.

Change subject: Remove mobile webrequest_source merging it in text
......................................................................


Remove mobile webrequest_source merging it in text

Bug: T122651
Change-Id: I2c356adca2c4198d33b0c4c2eeb9d2df010e12cb
---
M bin/refinery-dump-status-webrequest-partitions
M oozie/mobile_apps/session_metrics/coordinator.properties
M oozie/mobile_apps/session_metrics/coordinator.xml
M oozie/mobile_apps/uniques/daily/coordinator.xml
M oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
M oozie/mobile_apps/uniques/monthly/coordinator.xml
M oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
M oozie/pagecounts-all-sites/load/coordinator.xml
M oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
M oozie/pageview/hourly/coordinator.xml
M oozie/pageview/hourly/pageview_hourly.hql
M oozie/util/hive/partition/add/workflow.properties
M oozie/webrequest/datasets.xml
M oozie/webrequest/datasets_raw.xml
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
D oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
R oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
D oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
C oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
M oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
M oozie/webrequest/legacy_tsvs/workflow.xml
M oozie/webrequest/load/bundle.xml
M oozie/webrequest/refine/bundle.xml
M python/refinery/util.py
M python/tests/test_refinery/test_util.py
28 files changed, 51 insertions(+), 405 deletions(-)

Approvals:
  Joal: Verified; Looks good to me, approved



diff --git a/bin/refinery-dump-status-webrequest-partitions b/bin/refinery-dump-status-webrequest-partitions
index e5ea750..914c739 100755
--- a/bin/refinery-dump-status-webrequest-partitions
+++ b/bin/refinery-dump-status-webrequest-partitions
@@ -75,12 +75,12 @@
     DATASET_VISIBILITIES["$DATASET"]=no
 }
 
-add_dataset "legacy_tsvs" "daily" "    5xx    | 5xx-misc  |5xx-mobile | 5xx-text  |5xx-upload |    api    |   edits   | glam_nara |  mobile   |  sampled  |   zero    |"
+add_dataset "legacy_tsvs" "daily" "    5xx    | 5xx-misc  | 5xx-text  |5xx-upload |    api    |   edits   | glam_nara |  sampled  |   zero    |"
 add_dataset "mediacounts" "daily" "   full  | top1000 |"
 add_dataset "pagecounts_all_sites" "hourly" " file name date  |  page   | project |"
 add_dataset "pagecounts_raw" "hourly" " file name date  |  page   | project |"
-add_dataset "raw_webrequest" "hourly" "     maps    |     misc    |    mobile   |     text    |    upload   |"
-add_dataset "webrequest" "hourly" "  maps  |  misc  | mobile |  text  | upload |"
+add_dataset "raw_webrequest" "hourly" "     maps    |     misc    |     text    |    upload   |"
+add_dataset "webrequest" "hourly" "  maps  |  misc  |  text  | upload |"
 add_dataset "pageview" "hourly" "  hourly  |"
 add_dataset "projectview" "hourly" "   hourly    |"
 
@@ -373,13 +373,11 @@
     for BASE in \
         5xx/5xx \
         5xx-misc/5xx-misc \
-        5xx-mobile/5xx-mobile \
         5xx-text/5xx-text \
         5xx-upload/5xx-upload \
         api/api-usage \
         edits/edits \
         glam_nara/glam_nara \
-        mobile/mobile-sampled-100 \
         sampled/sampled-1000 \
         zero/zero \
 
@@ -492,7 +490,7 @@
 
     local DATE_HDFS_PADDED="$(date --utc -d "$DATE" +'%Y/%m/%d/%H')"
 
-    for SOURCE in maps misc mobile text upload
+    for SOURCE in maps misc text upload
     do
         log_no_lf "    "
         dump_dataset_raw_webrequest_partition "$DATE_HDFS_PADDED" "$SOURCE"
@@ -506,7 +504,7 @@
     local DATE_DIRS_REL="$(date --utc -d "$DATE" +'year=%Y/month=%m/day=%d/hour=%H')"
     DATE_DIRS_REL="${DATE_DIRS_REL//=0/=}"
 
-    for SOURCE in maps misc mobile text upload
+    for SOURCE in maps misc text upload
     do
         local STATUS="X"
         
SUCCESS_FILE_ABS="$WEBREQUEST_DATA_DIR_ABS/webrequest_source=$SOURCE/$DATE_DIRS_REL/_SUCCESS"
diff --git a/oozie/mobile_apps/session_metrics/coordinator.properties b/oozie/mobile_apps/session_metrics/coordinator.properties
index 9aca326..4a43857 100644
--- a/oozie/mobile_apps/session_metrics/coordinator.properties
+++ b/oozie/mobile_apps/session_metrics/coordinator.properties
@@ -1,5 +1,5 @@
 # Configures a coordinator to automatically manage generating app session metrics from
-# the refined webrequest mobile data. Any of the following properties are overidable with -D.
+# the refined webrequest text data. Any of the following properties are overidable with -D.
 # Usage:
 #   oozie job -Duser=$USER -Dstart_time=2015-05-01T00:00Z -submit -config oozie/mobile_apps/session_metrics/coordinator.properties
 #
diff --git a/oozie/mobile_apps/session_metrics/coordinator.xml b/oozie/mobile_apps/session_metrics/coordinator.xml
index 197db0f..67593b8 100644
--- a/oozie/mobile_apps/session_metrics/coordinator.xml
+++ b/oozie/mobile_apps/session_metrics/coordinator.xml
@@ -50,7 +50,7 @@
     </datasets>
 
     <input-events>
-        <data-in name="mobile" dataset="webrequest_mobile">
+        <data-in name="text" dataset="webrequest_text">
             <!-- 30 days of data in hours -->
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(24 * 30 - 1)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/daily/coordinator.xml b/oozie/mobile_apps/uniques/daily/coordinator.xml
index d0877b4..7941239 100644
--- a/oozie/mobile_apps/uniques/daily/coordinator.xml
+++ b/oozie/mobile_apps/uniques/daily/coordinator.xml
@@ -40,13 +40,9 @@
 
     <input-events>
         <!--
-            Please see datasets definition, the webrequest_mobile
-            and webrequest_text are refined datasets from the raw data.
+            Please see datasets definition webrequest_text is a
+            refined dataset from the raw data.
         -->
-        <data-in name="mobile" dataset="webrequest_mobile">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
         <data-in name="text" dataset="webrequest_text">
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(23)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql b/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
index 4022c17..ba32fa6 100644
--- a/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
+++ b/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
@@ -60,14 +60,14 @@
         month,
         day,
         CASE WHEN user_agent LIKE('%iPhone%') THEN 'iOS' ELSE 'Android' END AS 
platform,
-        COALESCE(x_analytics_map['wmfuuid'], 
+        COALESCE(x_analytics_map['wmfuuid'],
                  parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY', 
'appInstallID')) AS uuid
     FROM ${source_table}
     WHERE user_agent LIKE('WikipediaApp%')
         AND parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY', 
'action') = 'mobileview'
-        AND COALESCE(x_analytics_map['wmfuuid'], 
+        AND COALESCE(x_analytics_map['wmfuuid'],
                      parse_url(concat('http://bla.org/woo/', uri_query), 
'QUERY', 'appInstallID')) IS NOT NULL
-        AND webrequest_source IN ('mobile','text')
+        AND webrequest_source IN ('text')
         AND year=${year}
         AND month=${month}
         AND day=${day}
diff --git a/oozie/mobile_apps/uniques/monthly/coordinator.xml b/oozie/mobile_apps/uniques/monthly/coordinator.xml
index e2a7b55..bc515ae 100644
--- a/oozie/mobile_apps/uniques/monthly/coordinator.xml
+++ b/oozie/mobile_apps/uniques/monthly/coordinator.xml
@@ -42,13 +42,9 @@
 
     <input-events>
         <!--
-            Please see datasets definition, the webrequest_mobile
-            and webrequest_text are refined datasets from the raw data.
+            Please see datasets definition webrequest_text is a
+            refined dataset from the raw data.
         -->
-        <data-in name="mobile" dataset="webrequest_mobile">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(coord:daysInMonth(0) * 24 - 
1)}</end-instance>
-        </data-in>
         <data-in name="text" dataset="webrequest_text">
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(coord:daysInMonth(0) * 24 - 
1)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql b/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
index 566e48a..1d263c0 100644
--- a/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
+++ b/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
@@ -57,14 +57,14 @@
         year,
         month,
         CASE WHEN user_agent LIKE('%iPhone%') THEN 'iOS' ELSE 'Android' END AS 
platform,
-        COALESCE(x_analytics_map['wmfuuid'], 
+        COALESCE(x_analytics_map['wmfuuid'],
                  parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY', 
'appInstallID')) AS uuid
     FROM ${source_table}
     WHERE user_agent LIKE('WikipediaApp%')
         AND parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY', 
'action') = 'mobileview'
-        AND COALESCE(x_analytics_map['wmfuuid'], 
+        AND COALESCE(x_analytics_map['wmfuuid'],
                      parse_url(concat('http://bla.org/woo/', uri_query), 
'QUERY', 'appInstallID')) IS NOT NULL
-        AND webrequest_source IN ('mobile','text')
+        AND webrequest_source IN ('text')
         AND year=${year}
         AND month=${month}
 )
diff --git a/oozie/pagecounts-all-sites/load/coordinator.xml b/oozie/pagecounts-all-sites/load/coordinator.xml
index 0ea0317..b570fd6 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -59,9 +59,6 @@
         <data-in name="input_text" dataset="webrequest_text">
             <instance>${coord:current(0)}</instance>
         </data-in>
-        <data-in name="input_mobile" dataset="webrequest_mobile">
-            <instance>${coord:current(0)}</instance>
-        </data-in>
     </input-events>
 
     <output-events>
diff --git a/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
index cf41277..0fc9a02 100644
--- a/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
+++ b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
@@ -44,7 +44,7 @@
             response_size
         FROM ${source_table}
         WHERE
-            webrequest_source IN ('text', 'mobile')
+            webrequest_source IN ('text')
             AND year=${year}
             AND month=${month}
             AND day=${day}
diff --git a/oozie/pageview/hourly/coordinator.xml b/oozie/pageview/hourly/coordinator.xml
index cee8305..f79a945 100644
--- a/oozie/pageview/hourly/coordinator.xml
+++ b/oozie/pageview/hourly/coordinator.xml
@@ -79,9 +79,6 @@
     </datasets>
 
     <input-events>
-        <data-in name="mobile_refined_input" dataset="webrequest_mobile">
-            <instance>${coord:current(0)}</instance>
-        </data-in>
         <data-in name="text_refined_input" dataset="webrequest_text">
             <instance>${coord:current(0)}</instance>
         </data-in>
diff --git a/oozie/pageview/hourly/pageview_hourly.hql b/oozie/pageview/hourly/pageview_hourly.hql
index 95749c7..017d8eb 100644
--- a/oozie/pageview/hourly/pageview_hourly.hql
+++ b/oozie/pageview/hourly/pageview_hourly.hql
@@ -49,7 +49,7 @@
         page_id
     FROM
         ${source_table}
-    WHERE webrequest_source IN ('text', 'mobile') AND
+    WHERE webrequest_source IN ('text') AND
         year=${year} AND month=${month} AND day=${day} AND hour=${hour}
         AND is_pageview = TRUE
         AND COALESCE(pageview_info['project'], '') != ''
diff --git a/oozie/util/hive/partition/add/workflow.properties b/oozie/util/hive/partition/add/workflow.properties
index f210622..116da72 100644
--- a/oozie/util/hive/partition/add/workflow.properties
+++ b/oozie/util/hive/partition/add/workflow.properties
@@ -2,8 +2,8 @@
 # Any of the following properties are overidable with -D.  Some properties
 # are required to be set via the CLI: 'location' and 'partition_spec'.
 #
-# -Dlocation="hdfs://namenode.example.org:8020/path/to/data/directory/webrequest/webrequest_mobile/2014/04/02/01"
-# -Dpartition_spec="webrequest_source='mobile',year=2014,month=04,day=02,hour=01"
+# -Dlocation="hdfs://namenode.example.org:8020/path/to/data/directory/webrequest/webrequest_text/2014/04/02/01"
+# -Dpartition_spec="webrequest_source='text',year=2014,month=04,day=02,hour=01"
 
 
 name_node                         = hdfs://analytics-hadoop
diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index 81e77a4..3925db3 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -44,14 +44,6 @@
         <done-flag>_SUCCESS</done-flag>
     </dataset>
 
-    <dataset name="webrequest_mobile"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
-    </dataset>
-
     <dataset name="webrequest_text"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
diff --git a/oozie/webrequest/datasets_raw.xml b/oozie/webrequest/datasets_raw.xml
index bc9e872..f8580ab 100644
--- a/oozie/webrequest/datasets_raw.xml
+++ b/oozie/webrequest/datasets_raw.xml
@@ -29,13 +29,6 @@
         
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
         <done-flag>_IMPORTED</done-flag>
     </dataset>
-    <dataset name="webrequest_mobile_raw_unchecked"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_IMPORTED</done-flag>
-    </dataset>
     <dataset name="webrequest_text_raw_unchecked"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
@@ -69,13 +62,6 @@
              initial-instance="${start_time}"
              timezone="Universal">
         
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_PARTITIONED</done-flag>
-    </dataset>
-    <dataset name="webrequest_mobile_raw_partitioned"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
         <done-flag>_PARTITIONED</done-flag>
     </dataset>
     <dataset name="webrequest_text_raw_partitioned"
@@ -114,13 +100,6 @@
              initial-instance="${start_time}"
              timezone="Universal">
         
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
-    </dataset>
-    <dataset name="webrequest_mobile_raw"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
         <done-flag>_SUCCESS</done-flag>
     </dataset>
     <dataset name="webrequest_text_raw"
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties
index 9f13559..50269ae 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -33,9 +33,8 @@
 # and have the coordinators that depend on upload block, while the coordinators
 # that do not depend on upload continue to run.
 coordinator_misc_file                  = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml
-coordinator_misc_mobile_text_file      = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
-coordinator_mobile_text_file           = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
-coordinator_mobile_text_upload_file    = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+coordinator_misc_text_file             = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_text.xml
+coordinator_text_upload_file           = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text_upload.xml
 coordinator_text_file                  = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml
 coordinator_upload_file                = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml
 
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml b/oozie/webrequest/legacy_tsvs/bundle.xml
index dad44fd..654bfde 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -7,9 +7,8 @@
         <!-- Required properties -->
         <property><name>queue_name</name></property>
         <property><name>coordinator_misc_file</name></property>
-        <property><name>coordinator_misc_mobile_text_file</name></property>
-        <property><name>coordinator_mobile_text_file</name></property>
-        <property><name>coordinator_mobile_text_upload_file</name></property>
+        <property><name>coordinator_misc_text_file</name></property>
+        <property><name>coordinator_text_upload_file</name></property>
         <property><name>coordinator_text_file</name></property>
         <property><name>coordinator_upload_file</name></property>
         <property><name>name_node</name></property>
@@ -33,7 +32,7 @@
         No 'misc', as the sampled-1000 was last produced on erbium's
         udp2log stream, which did not receive those two webrequest_sources.
         -->
-        <app-path>${coordinator_mobile_text_upload_file}</app-path>
+        <app-path>${coordinator_text_upload_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
@@ -51,7 +50,7 @@
     </coordinator>
 
     <coordinator name="webrequest_legacy_tsvs-api-usage">
-        <app-path>${coordinator_mobile_text_file}</app-path>
+        <app-path>${coordinator_text_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
@@ -69,7 +68,7 @@
     </coordinator>
 
     <coordinator name="webrequest_legacy_tsvs-glam_nara">
-        <app-path>${coordinator_mobile_text_upload_file}</app-path>
+        <app-path>${coordinator_text_upload_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
@@ -87,7 +86,7 @@
     </coordinator>
 
     <coordinator name="webrequest_legacy_tsvs-edits">
-        <app-path>${coordinator_mobile_text_file}</app-path>
+        <app-path>${coordinator_text_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
@@ -109,7 +108,7 @@
         No 'upload', as that explicitly got excluded in the upd2log
         filters.
         -->
-        <app-path>${coordinator_misc_mobile_text_file}</app-path>
+        <app-path>${coordinator_misc_text_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
deleted file mode 100644
index 6ec1bfc..0000000
--- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
+++ /dev/null
@@ -1,145 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="webrequest_legacy_tsvs-${aspect_name}-coord"
-    frequency="${coord:days(1)}"
-    start="${start_time}"
-    end="${stop_time}"
-    timezone="Universal">
-
-    <parameters>
-
-        <!-- Required properties -->
-        <property><name>queue_name</name></property>
-        <property><name>name_node</name></property>
-        <property><name>job_tracker</name></property>
-        <property><name>start_time</name></property>
-        <property><name>hour_offset</name></property>
-        <property><name>stop_time</name></property>
-        <property><name>webrequest_datasets_file</name></property>
-        <property><name>webrequest_data_directory</name></property>
-        <property><name>hive_site_xml</name></property>
-        <property><name>artifacts_directory</name></property>
-        <property><name>workflow_file</name></property>
-        <property><name>webrequest_table</name></property>
-        <property><name>mark_directory_done_workflow_file</name></property>
-        <property><name>temporary_directory</name></property>
-        <property><name>aspect_tsv_archive_directory</name></property>
-        <property><name>archive_job_output_workflow_file</name></property>
-        <property><name>aspect_name</name></property>
-    </parameters>
-
-    <controls>
-        <!--
-        By having materialized jobs not timeout, we ease backfilling incidents
-        after recoverable hiccups on the dataset producers.
-        -->
-        <timeout>-1</timeout>
-
-        <!--
-        Since the job only runs daily, even low concurrency allows to catch up
-        pretty fast. Hence, we can limit concurrency to 1, as the tsvs 
typically
-        process quite some data.
-        -->
-        <concurrency>1</concurrency>
-
-        <!--
-        In order to keep backfilling after an incident simple, we only start
-        throttling materialization after 4 days.
-        Due to the low concurrency, and low discrepancy between progressing
-        time, and expected availability of datasets, we should typically have
-        far less materialized jobs.
-        -->
-        <throttle>4</throttle>
-    </controls>
-
-    <datasets>
-        <include>${webrequest_datasets_file}</include>
-    </datasets>
-
-    <input-events>
-
-        <data-in name="webrequest_misc" dataset="webrequest_misc">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <data-in name="webrequest_mobile" dataset="webrequest_mobile">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <data-in name="webrequest_text" dataset="webrequest_text">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <!--
-        The following dataset is not required as input to the
-        workflow, but only helps to delay running it.
-
-        The 24 hours offset is for a full day. And we subtract 2 hours, as
-        webrequest processing starts 2 hours after the respective hour.
-        -->
-        <data-in name="delay" dataset="webrequest_text">
-            <instance>${coord:current(24-2+hour_offset)}</instance>
-        </data-in>
-    </input-events>
-
-    <action>
-        <workflow>
-            <app-path>${workflow_file}</app-path>
-            <configuration>
-
-                <!-- Pass these properties through to the workflow -->
-                
<property><name>name_node</name><value>${name_node}</value></property>
-                
<property><name>job_tracker</name><value>${job_tracker}</value></property>
-                
<property><name>queue_name</name><value>${queue_name}</value></property>
-
-                <property>
-                    <name>hive_site_xml</name>
-                    <value>${hive_site_xml}</value>
-                </property>
-                <property>
-                    <name>artifacts_directory</name>
-                    <value>${artifacts_directory}</value>
-                </property>
-                <property>
-                    <name>webrequest_table</name>
-                    <value>${webrequest_table}</value>
-                </property>
-                <property>
-                    <name>year</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy")}</value>
-                </property>
-                <property>
-                    <name>month</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"MM")}</value>
-                </property>
-                <property>
-                    <name>day</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"dd")}</value>
-                </property>
-                <property>
-                    <name>mark_directory_done_workflow_file</name>
-                    <value>${mark_directory_done_workflow_file}</value>
-                </property>
-                <property>
-                    <name>temporary_directory</name>
-                    <value>${temporary_directory}</value>
-                </property>
-                <property>
-                    <name>aspect_name</name>
-                    <value>${aspect_name}</value>
-                </property>
-                <property>
-                    <name>aspect_tsv_archive_directory</name>
-                    <value>${aspect_tsv_archive_directory}</value>
-                </property>
-                <property>
-                    <name>archive_job_output_workflow_file</name>
-                    <value>${archive_job_output_workflow_file}</value>
-                </property>
-            </configuration>
-        </workflow>
-    </action>
-</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
similarity index 98%
rename from oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
rename to oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
index 323fed6..6803d89 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
@@ -57,7 +57,8 @@
     </datasets>
 
     <input-events>
-        <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+
+        <data-in name="webrequest_misc" dataset="webrequest_misc">
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(23)}</end-instance>
         </data-in>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
deleted file mode 100644
index 3347b56..0000000
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+++ /dev/null
@@ -1,144 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="webrequest_legacy_tsvs-${aspect_name}-coord"
-    frequency="${coord:days(1)}"
-    start="${start_time}"
-    end="${stop_time}"
-    timezone="Universal">
-
-    <parameters>
-
-        <!-- Required properties -->
-        <property><name>queue_name</name></property>
-        <property><name>name_node</name></property>
-        <property><name>job_tracker</name></property>
-        <property><name>start_time</name></property>
-        <property><name>hour_offset</name></property>
-        <property><name>stop_time</name></property>
-        <property><name>webrequest_datasets_file</name></property>
-        <property><name>webrequest_data_directory</name></property>
-        <property><name>hive_site_xml</name></property>
-        <property><name>artifacts_directory</name></property>
-        <property><name>workflow_file</name></property>
-        <property><name>webrequest_table</name></property>
-        <property><name>mark_directory_done_workflow_file</name></property>
-        <property><name>temporary_directory</name></property>
-        <property><name>aspect_tsv_archive_directory</name></property>
-        <property><name>archive_job_output_workflow_file</name></property>
-        <property><name>aspect_name</name></property>
-    </parameters>
-
-    <controls>
-        <!--
-        By having materialized jobs not timeout, we ease backfilling incidents
-        after recoverable hiccups on the dataset producers.
-        -->
-        <timeout>-1</timeout>
-
-        <!--
-        Since the job only runs daily, even low concurrency allows to catch up
-        pretty fast. Hence, we can limit concurrency to 1, as the tsvs 
typically
-        process quite some data.
-        -->
-        <concurrency>1</concurrency>
-
-        <!--
-        In order to keep backfilling after an incident simple, we only start
-        throttling materialization after 4 days.
-        Due to the low concurrency, and low discrepancy between progressing
-        time, and expected availability of datasets, we should typically have
-        far less materialized jobs.
-        -->
-        <throttle>4</throttle>
-    </controls>
-
-    <datasets>
-        <include>${webrequest_datasets_file}</include>
-    </datasets>
-
-    <input-events>
-        <data-in name="webrequest_mobile" dataset="webrequest_mobile">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <data-in name="webrequest_text" dataset="webrequest_text">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <data-in name="webrequest_upload" dataset="webrequest_upload">
-            <start-instance>${coord:current(0)}</start-instance>
-            <end-instance>${coord:current(23)}</end-instance>
-        </data-in>
-
-        <!--
-        The following dataset is not required as input to the
-        workflow, but only helps to delay running it.
-
-        The 24 hours offset is for a full day. And we subtract 2 hours, as
-        webrequest processing starts 2 hours after the respective hour.
-        -->
-        <data-in name="delay" dataset="webrequest_text">
-            <instance>${coord:current(24-2+hour_offset)}</instance>
-        </data-in>
-    </input-events>
-
-    <action>
-        <workflow>
-            <app-path>${workflow_file}</app-path>
-            <configuration>
-
-                <!-- Pass these properties through to the workflow -->
-                
<property><name>name_node</name><value>${name_node}</value></property>
-                
<property><name>job_tracker</name><value>${job_tracker}</value></property>
-                
<property><name>queue_name</name><value>${queue_name}</value></property>
-
-                <property>
-                    <name>hive_site_xml</name>
-                    <value>${hive_site_xml}</value>
-                </property>
-                <property>
-                    <name>artifacts_directory</name>
-                    <value>${artifacts_directory}</value>
-                </property>
-                <property>
-                    <name>webrequest_table</name>
-                    <value>${webrequest_table}</value>
-                </property>
-                <property>
-                    <name>year</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"yyyy")}</value>
-                </property>
-                <property>
-                    <name>month</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"MM")}</value>
-                </property>
-                <property>
-                    <name>day</name>
-                    <value>${coord:formatTime(coord:nominalTime(), 
"dd")}</value>
-                </property>
-                <property>
-                    <name>mark_directory_done_workflow_file</name>
-                    <value>${mark_directory_done_workflow_file}</value>
-                </property>
-                <property>
-                    <name>temporary_directory</name>
-                    <value>${temporary_directory}</value>
-                </property>
-                <property>
-                    <name>aspect_name</name>
-                    <value>${aspect_name}</value>
-                </property>
-                <property>
-                    <name>aspect_tsv_archive_directory</name>
-                    <value>${aspect_tsv_archive_directory}</value>
-                </property>
-                <property>
-                    <name>archive_job_output_workflow_file</name>
-                    <value>${archive_job_output_workflow_file}</value>
-                </property>
-            </configuration>
-        </workflow>
-    </action>
-</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
similarity index 98%
copy from oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
copy to oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
index 323fed6..988ba89 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
@@ -57,12 +57,13 @@
     </datasets>
 
     <input-events>
-        <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+
+        <data-in name="webrequest_text" dataset="webrequest_text">
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(23)}</end-instance>
         </data-in>
 
-        <data-in name="webrequest_text" dataset="webrequest_text">
+        <data-in name="webrequest_upload" dataset="webrequest_upload">
             <start-instance>${coord:current(0)}</start-instance>
             <end-instance>${coord:current(23)}</end-instance>
         </data-in>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
index 69d5651..35f971c 100644
--- a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -2,7 +2,7 @@
 SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
 --^ To work around HIVE-3296, we have SETs before any comments
 
--- Generates a TSV of server errors on the mobile, and text partitions
+-- Generates a TSV of server errors on the text partition
 --
 -- Parameters:
 --     destination_directory -- Directory in HDFS where to store the generated
@@ -19,7 +19,7 @@
 --     hive -f generate_5xx_tsv.hql                \
 --         -d destination_directory=/tmp/foo       \
 --         -d webrequest_table=wmf_raw.webrequest  \
---         -d webrequest_sources="'mobile'"        \
+--         -d webrequest_sources="'text'"          \
 --         -d year=2014                            \
 --         -d month=4                              \
 --         -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
index e054713..c31363a 100644
--- a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
@@ -19,7 +19,7 @@
 --     hive -f generate_api-usage_tsv.hql          \
 --         -d destination_directory=/tmp/foo       \
 --         -d webrequest_table=wmf_raw.webrequest  \
---         -d webrequest_sources="'mobile'"        \
+--         -d webrequest_sources="'text'"          \
 --         -d year=2014                            \
 --         -d month=4                              \
 --         -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql 
b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
index 92592c1..ca82856 100644
--- a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
@@ -2,7 +2,7 @@
 SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
 --^ To work around HIVE-3296, we have SETs before any comments
 
--- Generates a TSV for 1:1000 sampled requests for mobile, text, and upload
+-- Generates a TSV for 1:1000 sampled requests for text and upload
 --
 -- Parameters:
 --     destination_directory -- Directory in HDFS where to store the generated
@@ -19,7 +19,7 @@
 --     hive -f generate_sampled-1000_tsv.hql       \
 --         -d destination_directory=/tmp/foo       \
 --         -d webrequest_table=wmf_raw.webrequest  \
---         -d webrequest_sources="'mobile'"        \
+--         -d webrequest_sources="'text'"          \
 --         -d year=2014                            \
 --         -d month=4                              \
 --         -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml 
b/oozie/webrequest/legacy_tsvs/workflow.xml
index ec91400..5bac9f6 100644
--- a/oozie/webrequest/legacy_tsvs/workflow.xml
+++ b/oozie/webrequest/legacy_tsvs/workflow.xml
@@ -120,11 +120,11 @@
             This variable holds quoted strings, separated by commas. So for
             example a coordinator path of
 
-              /path/to/coordinator_misc_mobile.xml
+              /path/to/coordinator_misc_text.xml
 
             will get turned into
 
-              'misc', 'mobile'
+              'misc', 'text'
 
             . This allows to use the variable straight in HivQL via
 
diff --git a/oozie/webrequest/load/bundle.xml b/oozie/webrequest/load/bundle.xml
index 17ac0fa..2075bfd 100644
--- a/oozie/webrequest/load/bundle.xml
+++ b/oozie/webrequest/load/bundle.xml
@@ -44,16 +44,6 @@
         </configuration>
     </coordinator>
 
-    <coordinator name="load_webrequest-mobile-coord">
-        <app-path>${coordinator_file}</app-path>
-        <configuration>
-            <property>
-                <name>webrequest_source</name>
-                <value>mobile</value>
-            </property>
-        </configuration>
-    </coordinator>
-
     <coordinator name="load_webrequest-text-coord">
         <app-path>${coordinator_file}</app-path>
         <configuration>
diff --git a/oozie/webrequest/refine/bundle.xml 
b/oozie/webrequest/refine/bundle.xml
index 3cde34b..cbd017e 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -48,16 +48,6 @@
         </configuration>
     </coordinator>
 
-    <coordinator name="refine_webrequest-mobile-coord">
-        <app-path>${coordinator_file}</app-path>
-        <configuration>
-            <property>
-                <name>webrequest_source</name>
-                <value>mobile</value>
-            </property>
-        </configuration>
-    </coordinator>
-
     <coordinator name="refine_webrequest-text-coord">
         <app-path>${coordinator_file}</app-path>
         <configuration>
diff --git a/python/refinery/util.py b/python/refinery/util.py
index ce9cf79..63a1f95 100755
--- a/python/refinery/util.py
+++ b/python/refinery/util.py
@@ -267,10 +267,10 @@
 
         Example:
             partition_spec_from_path(
-                
path='/wmf/data/raw/webrequest/webrequest_mobile/hourly/2014/05/14/23',
+                
path='/wmf/data/raw/webrequest/webrequest_text/hourly/2014/05/14/23',
                 
regex=r'/webrequest_(?P<webrequest_source>[^/]+)/hourly/(?P<year>[^/]+)/(?P<month>[^/]+)/(?P<day>[^/]+)/(?P<hour>[^/]+)'
             )
-            returns: 
'webrequest_source='mobile',year=2014,month=05,day=14,hour=23
+            returns: 
"webrequest_source='text',year=2014,month=05,day=14,hour=23"
         """
         if isinstance(regex, basestring):
             regex = re.compile(regex)
@@ -312,7 +312,7 @@
 
         Example:
             partition_datetime_from_spec(
-                
spec='webrequest_source=\'mobile\',year=2014,month=05,day=14,hour=00',
+                
spec='webrequest_source=\'text\',year=2014,month=05,day=14,hour=00',
                 
regex=r'webrequest_source=(?P<webrequest_source>[^/,]+)[/,]year=(?P<year>[^/,]+)[/,]month=(?P<month>[^/,]+)[/,]day=(?P<day>[^/]+)[/,]hour=(?P<hour>[^/,]+)'
             )
             returns: datetime.datetime(2014, 5, 14, 23, 0)
diff --git a/python/tests/test_refinery/test_util.py 
b/python/tests/test_refinery/test_util.py
index 4b01381..b9e8e96 100644
--- a/python/tests/test_refinery/test_util.py
+++ b/python/tests/test_refinery/test_util.py
@@ -38,17 +38,17 @@
         self.table_info = {
             'table1': {
                 'location':             '/path/to/table1',
-                'partitions_desc':      
['webrequest_source=mobile/year=2013/month=10/day=01/hour=01', 
'webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
-                'partitions_spec':      
['webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=01', 
'webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=02'],
+                'partitions_desc':      
['webrequest_source=text/year=2013/month=10/day=01/hour=01', 
'webrequest_source=text/year=2013/month=10/day=01/hour=02'],
+                'partitions_spec':      
['webrequest_source=\'text\',year=2013,month=10,day=01,hour=01', 
'webrequest_source=\'text\',year=2013,month=10,day=01,hour=02'],
                 'partitions_datetime':  [datetime(2013,10,01,01), 
datetime(2013,10,01,02)],
-                'partitions_path':      
['/path/to/table1/webrequest_mobile/hourly/2013/10/01/01', 
'/path/to/table1/webrequest_mobile/hourly/2013/10/01/02'],
+                'partitions_path':      
['/path/to/table1/webrequest_text/hourly/2013/10/01/01', 
'/path/to/table1/webrequest_text/hourly/2013/10/01/02'],
             },
             'table2': {
                 'location':             '/path/to/table2',
-                'partitions_desc':      
['webrequest_source=mobile/year=2013/month=10/day=01/hour=01', 
'webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
-                'partitions_spec':      
['webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=01', 
'webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=02'],
+                'partitions_desc':      
['webrequest_source=text/year=2013/month=10/day=01/hour=01', 
'webrequest_source=text/year=2013/month=10/day=01/hour=02'],
+                'partitions_spec':      
['webrequest_source=\'text\',year=2013,month=10,day=01,hour=01', 
'webrequest_source=\'text\',year=2013,month=10,day=01,hour=02'],
                 'partitions_datetime':  [datetime(2013,10,01,01), 
datetime(2013,10,01,02)],
-                'partitions_path':      
['/path/to/table1/webrequest_source=mobile/year=2013/month=10/day=01/hour=01', 
'/path/to/table2/webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
+                'partitions_path':      
['/path/to/table1/webrequest_source=text/year=2013/month=10/day=01/hour=01', 
'/path/to/table2/webrequest_source=text/year=2013/month=10/day=01/hour=02'],
             },
         }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/264870
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I2c356adca2c4198d33b0c4c2eeb9d2df010e12cb
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Madhuvishy <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to