Joal has submitted this change and it was merged.
Change subject: Remove mobile webrequest_source merging it in text
......................................................................
Remove mobile webrequest_source merging it in text
Bug: T122651
Change-Id: I2c356adca2c4198d33b0c4c2eeb9d2df010e12cb
---
M bin/refinery-dump-status-webrequest-partitions
M oozie/mobile_apps/session_metrics/coordinator.properties
M oozie/mobile_apps/session_metrics/coordinator.xml
M oozie/mobile_apps/uniques/daily/coordinator.xml
M oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
M oozie/mobile_apps/uniques/monthly/coordinator.xml
M oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
M oozie/pagecounts-all-sites/load/coordinator.xml
M oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
M oozie/pageview/hourly/coordinator.xml
M oozie/pageview/hourly/pageview_hourly.hql
M oozie/util/hive/partition/add/workflow.properties
M oozie/webrequest/datasets.xml
M oozie/webrequest/datasets_raw.xml
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
D oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
R oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
D oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
C oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
M oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
M oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
M oozie/webrequest/legacy_tsvs/workflow.xml
M oozie/webrequest/load/bundle.xml
M oozie/webrequest/refine/bundle.xml
M python/refinery/util.py
M python/tests/test_refinery/test_util.py
28 files changed, 51 insertions(+), 405 deletions(-)
Approvals:
Joal: Verified; Looks good to me, approved
diff --git a/bin/refinery-dump-status-webrequest-partitions
b/bin/refinery-dump-status-webrequest-partitions
index e5ea750..914c739 100755
--- a/bin/refinery-dump-status-webrequest-partitions
+++ b/bin/refinery-dump-status-webrequest-partitions
@@ -75,12 +75,12 @@
DATASET_VISIBILITIES["$DATASET"]=no
}
-add_dataset "legacy_tsvs" "daily" " 5xx | 5xx-misc |5xx-mobile | 5xx-text |5xx-upload | api | edits | glam_nara | mobile | sampled | zero |"
+add_dataset "legacy_tsvs" "daily" " 5xx | 5xx-misc | 5xx-text |5xx-upload | api | edits | glam_nara | sampled | zero |"
add_dataset "mediacounts" "daily" " full | top1000 |"
add_dataset "pagecounts_all_sites" "hourly" " file name date | page | project |"
add_dataset "pagecounts_raw" "hourly" " file name date | page | project |"
-add_dataset "raw_webrequest" "hourly" " maps | misc | mobile | text | upload |"
-add_dataset "webrequest" "hourly" " maps | misc | mobile | text | upload |"
+add_dataset "raw_webrequest" "hourly" " maps | misc | text | upload |"
+add_dataset "webrequest" "hourly" " maps | misc | text | upload |"
add_dataset "pageview" "hourly" " hourly |"
add_dataset "projectview" "hourly" " hourly |"
@@ -373,13 +373,11 @@
for BASE in \
5xx/5xx \
5xx-misc/5xx-misc \
- 5xx-mobile/5xx-mobile \
5xx-text/5xx-text \
5xx-upload/5xx-upload \
api/api-usage \
edits/edits \
glam_nara/glam_nara \
- mobile/mobile-sampled-100 \
sampled/sampled-1000 \
zero/zero \
@@ -492,7 +490,7 @@
local DATE_HDFS_PADDED="$(date --utc -d "$DATE" +'%Y/%m/%d/%H')"
- for SOURCE in maps misc mobile text upload
+ for SOURCE in maps misc text upload
do
log_no_lf " "
dump_dataset_raw_webrequest_partition "$DATE_HDFS_PADDED" "$SOURCE"
@@ -506,7 +504,7 @@
local DATE_DIRS_REL="$(date --utc -d "$DATE" +'year=%Y/month=%m/day=%d/hour=%H')"
DATE_DIRS_REL="${DATE_DIRS_REL//=0/=}"
- for SOURCE in maps misc mobile text upload
+ for SOURCE in maps misc text upload
do
local STATUS="X"
SUCCESS_FILE_ABS="$WEBREQUEST_DATA_DIR_ABS/webrequest_source=$SOURCE/$DATE_DIRS_REL/_SUCCESS"
diff --git a/oozie/mobile_apps/session_metrics/coordinator.properties
b/oozie/mobile_apps/session_metrics/coordinator.properties
index 9aca326..4a43857 100644
--- a/oozie/mobile_apps/session_metrics/coordinator.properties
+++ b/oozie/mobile_apps/session_metrics/coordinator.properties
@@ -1,5 +1,5 @@
# Configures a coordinator to automatically manage generating app session metrics from
-# the refined webrequest mobile data. Any of the following properties are overidable with -D.
+# the refined webrequest text data. Any of the following properties are overidable with -D.
# Usage:
# oozie job -Duser=$USER -Dstart_time=2015-05-01T00:00Z -submit -config oozie/mobile_apps/session_metrics/coordinator.properties
#
diff --git a/oozie/mobile_apps/session_metrics/coordinator.xml
b/oozie/mobile_apps/session_metrics/coordinator.xml
index 197db0f..67593b8 100644
--- a/oozie/mobile_apps/session_metrics/coordinator.xml
+++ b/oozie/mobile_apps/session_metrics/coordinator.xml
@@ -50,7 +50,7 @@
</datasets>
<input-events>
- <data-in name="mobile" dataset="webrequest_mobile">
+ <data-in name="text" dataset="webrequest_text">
<!-- 30 days of data in hours -->
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(24 * 30 - 1)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/daily/coordinator.xml
b/oozie/mobile_apps/uniques/daily/coordinator.xml
index d0877b4..7941239 100644
--- a/oozie/mobile_apps/uniques/daily/coordinator.xml
+++ b/oozie/mobile_apps/uniques/daily/coordinator.xml
@@ -40,13 +40,9 @@
<input-events>
<!--
- Please see datasets definition, the webrequest_mobile
- and webrequest_text are refined datasets from the raw data.
+ Please see datasets definition webrequest_text is a
+ refined dataset from the raw data.
-->
- <data-in name="mobile" dataset="webrequest_mobile">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
<data-in name="text" dataset="webrequest_text">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
b/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
index 4022c17..ba32fa6 100644
--- a/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
+++ b/oozie/mobile_apps/uniques/daily/generate_uniques_daily.hql
@@ -60,14 +60,14 @@
month,
day,
CASE WHEN user_agent LIKE('%iPhone%') THEN 'iOS' ELSE 'Android' END AS
platform,
- COALESCE(x_analytics_map['wmfuuid'],
+ COALESCE(x_analytics_map['wmfuuid'],
parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY',
'appInstallID')) AS uuid
FROM ${source_table}
WHERE user_agent LIKE('WikipediaApp%')
AND parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY',
'action') = 'mobileview'
- AND COALESCE(x_analytics_map['wmfuuid'],
+ AND COALESCE(x_analytics_map['wmfuuid'],
parse_url(concat('http://bla.org/woo/', uri_query),
'QUERY', 'appInstallID')) IS NOT NULL
- AND webrequest_source IN ('mobile','text')
+ AND webrequest_source IN ('text')
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/mobile_apps/uniques/monthly/coordinator.xml
b/oozie/mobile_apps/uniques/monthly/coordinator.xml
index e2a7b55..bc515ae 100644
--- a/oozie/mobile_apps/uniques/monthly/coordinator.xml
+++ b/oozie/mobile_apps/uniques/monthly/coordinator.xml
@@ -42,13 +42,9 @@
<input-events>
<!--
- Please see datasets definition, the webrequest_mobile
- and webrequest_text are refined datasets from the raw data.
+ Please see datasets definition webrequest_text is a
+ refined dataset from the raw data.
-->
- <data-in name="mobile" dataset="webrequest_mobile">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(coord:daysInMonth(0) * 24 - 1)}</end-instance>
- </data-in>
<data-in name="text" dataset="webrequest_text">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(coord:daysInMonth(0) * 24 - 1)}</end-instance>
diff --git a/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
b/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
index 566e48a..1d263c0 100644
--- a/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
+++ b/oozie/mobile_apps/uniques/monthly/generate_uniques_monthly.hql
@@ -57,14 +57,14 @@
year,
month,
CASE WHEN user_agent LIKE('%iPhone%') THEN 'iOS' ELSE 'Android' END AS
platform,
- COALESCE(x_analytics_map['wmfuuid'],
+ COALESCE(x_analytics_map['wmfuuid'],
parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY',
'appInstallID')) AS uuid
FROM ${source_table}
WHERE user_agent LIKE('WikipediaApp%')
AND parse_url(concat('http://bla.org/woo/', uri_query), 'QUERY',
'action') = 'mobileview'
- AND COALESCE(x_analytics_map['wmfuuid'],
+ AND COALESCE(x_analytics_map['wmfuuid'],
parse_url(concat('http://bla.org/woo/', uri_query),
'QUERY', 'appInstallID')) IS NOT NULL
- AND webrequest_source IN ('mobile','text')
+ AND webrequest_source IN ('text')
AND year=${year}
AND month=${month}
)
diff --git a/oozie/pagecounts-all-sites/load/coordinator.xml
b/oozie/pagecounts-all-sites/load/coordinator.xml
index 0ea0317..b570fd6 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -59,9 +59,6 @@
<data-in name="input_text" dataset="webrequest_text">
<instance>${coord:current(0)}</instance>
</data-in>
- <data-in name="input_mobile" dataset="webrequest_mobile">
- <instance>${coord:current(0)}</instance>
- </data-in>
</input-events>
<output-events>
diff --git a/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
index cf41277..0fc9a02 100644
--- a/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
+++ b/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql
@@ -44,7 +44,7 @@
response_size
FROM ${source_table}
WHERE
- webrequest_source IN ('text', 'mobile')
+ webrequest_source IN ('text')
AND year=${year}
AND month=${month}
AND day=${day}
diff --git a/oozie/pageview/hourly/coordinator.xml
b/oozie/pageview/hourly/coordinator.xml
index cee8305..f79a945 100644
--- a/oozie/pageview/hourly/coordinator.xml
+++ b/oozie/pageview/hourly/coordinator.xml
@@ -79,9 +79,6 @@
</datasets>
<input-events>
- <data-in name="mobile_refined_input" dataset="webrequest_mobile">
- <instance>${coord:current(0)}</instance>
- </data-in>
<data-in name="text_refined_input" dataset="webrequest_text">
<instance>${coord:current(0)}</instance>
</data-in>
diff --git a/oozie/pageview/hourly/pageview_hourly.hql
b/oozie/pageview/hourly/pageview_hourly.hql
index 95749c7..017d8eb 100644
--- a/oozie/pageview/hourly/pageview_hourly.hql
+++ b/oozie/pageview/hourly/pageview_hourly.hql
@@ -49,7 +49,7 @@
page_id
FROM
${source_table}
- WHERE webrequest_source IN ('text', 'mobile') AND
+ WHERE webrequest_source IN ('text') AND
year=${year} AND month=${month} AND day=${day} AND hour=${hour}
AND is_pageview = TRUE
AND COALESCE(pageview_info['project'], '') != ''
diff --git a/oozie/util/hive/partition/add/workflow.properties
b/oozie/util/hive/partition/add/workflow.properties
index f210622..116da72 100644
--- a/oozie/util/hive/partition/add/workflow.properties
+++ b/oozie/util/hive/partition/add/workflow.properties
@@ -2,8 +2,8 @@
# Any of the following properties are overidable with -D. Some properties
# are required to be set via the CLI: 'location' and 'partition_spec'.
#
-# -Dlocation="hdfs://namenode.example.org:8020/path/to/data/directory/webrequest/webrequest_mobile/2014/04/02/01"
-# -Dpartition_spec="webrequest_source='mobile',year=2014,month=04,day=02,hour=01"
+# -Dlocation="hdfs://namenode.example.org:8020/path/to/data/directory/webrequest/webrequest_text/2014/04/02/01"
+# -Dpartition_spec="webrequest_source='text',year=2014,month=04,day=02,hour=01"
name_node = hdfs://analytics-hadoop
diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index 81e77a4..3925db3 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -44,14 +44,6 @@
<done-flag>_SUCCESS</done-flag>
</dataset>
- <dataset name="webrequest_mobile"
- frequency="${coord:hours(1)}"
- initial-instance="${start_time}"
- timezone="Universal">
- <uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
- <done-flag>_SUCCESS</done-flag>
- </dataset>
-
<dataset name="webrequest_text"
frequency="${coord:hours(1)}"
initial-instance="${start_time}"
diff --git a/oozie/webrequest/datasets_raw.xml
b/oozie/webrequest/datasets_raw.xml
index bc9e872..f8580ab 100644
--- a/oozie/webrequest/datasets_raw.xml
+++ b/oozie/webrequest/datasets_raw.xml
@@ -29,13 +29,6 @@
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
<done-flag>_IMPORTED</done-flag>
</dataset>
- <dataset name="webrequest_mobile_raw_unchecked"
- frequency="${coord:hours(1)}"
- initial-instance="${start_time}"
- timezone="Universal">
- <uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
- <done-flag>_IMPORTED</done-flag>
- </dataset>
<dataset name="webrequest_text_raw_unchecked"
frequency="${coord:hours(1)}"
initial-instance="${start_time}"
@@ -69,13 +62,6 @@
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
- <done-flag>_PARTITIONED</done-flag>
- </dataset>
- <dataset name="webrequest_mobile_raw_partitioned"
- frequency="${coord:hours(1)}"
- initial-instance="${start_time}"
- timezone="Universal">
- <uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
<done-flag>_PARTITIONED</done-flag>
</dataset>
<dataset name="webrequest_text_raw_partitioned"
@@ -114,13 +100,6 @@
initial-instance="${start_time}"
timezone="Universal">
<uri-template>${webrequest_raw_data_directory}/webrequest_misc/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
- <done-flag>_SUCCESS</done-flag>
- </dataset>
- <dataset name="webrequest_mobile_raw"
- frequency="${coord:hours(1)}"
- initial-instance="${start_time}"
- timezone="Universal">
- <uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
<done-flag>_SUCCESS</done-flag>
</dataset>
<dataset name="webrequest_text_raw"
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties
b/oozie/webrequest/legacy_tsvs/bundle.properties
index 9f13559..50269ae 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -33,9 +33,8 @@
# and have the coordinators that depend on upload block, while the coordinators
# that do not depend on upload continue to run.
coordinator_misc_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml
-coordinator_misc_mobile_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
-coordinator_mobile_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
-coordinator_mobile_text_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+coordinator_misc_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_text.xml
+coordinator_text_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text_upload.xml
coordinator_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml
coordinator_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index dad44fd..654bfde 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -7,9 +7,8 @@
<!-- Required properties -->
<property><name>queue_name</name></property>
<property><name>coordinator_misc_file</name></property>
- <property><name>coordinator_misc_mobile_text_file</name></property>
- <property><name>coordinator_mobile_text_file</name></property>
- <property><name>coordinator_mobile_text_upload_file</name></property>
+ <property><name>coordinator_misc_text_file</name></property>
+ <property><name>coordinator_text_upload_file</name></property>
<property><name>coordinator_text_file</name></property>
<property><name>coordinator_upload_file</name></property>
<property><name>name_node</name></property>
@@ -33,7 +32,7 @@
No 'misc', as the sampled-1000 was last produced on erbium's
udp2log stream, which did not receive those two webrequest_sources.
-->
- <app-path>${coordinator_mobile_text_upload_file}</app-path>
+ <app-path>${coordinator_text_upload_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
@@ -51,7 +50,7 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-api-usage">
- <app-path>${coordinator_mobile_text_file}</app-path>
+ <app-path>${coordinator_text_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
@@ -69,7 +68,7 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-glam_nara">
- <app-path>${coordinator_mobile_text_upload_file}</app-path>
+ <app-path>${coordinator_text_upload_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
@@ -87,7 +86,7 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-edits">
- <app-path>${coordinator_mobile_text_file}</app-path>
+ <app-path>${coordinator_text_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
@@ -109,7 +108,7 @@
No 'upload', as that explicitly got excluded in the upd2log
filters.
-->
- <app-path>${coordinator_misc_mobile_text_file}</app-path>
+ <app-path>${coordinator_misc_text_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
deleted file mode 100644
index 6ec1bfc..0000000
--- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
+++ /dev/null
@@ -1,145 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="webrequest_legacy_tsvs-${aspect_name}-coord"
- frequency="${coord:days(1)}"
- start="${start_time}"
- end="${stop_time}"
- timezone="Universal">
-
- <parameters>
-
- <!-- Required properties -->
- <property><name>queue_name</name></property>
- <property><name>name_node</name></property>
- <property><name>job_tracker</name></property>
- <property><name>start_time</name></property>
- <property><name>hour_offset</name></property>
- <property><name>stop_time</name></property>
- <property><name>webrequest_datasets_file</name></property>
- <property><name>webrequest_data_directory</name></property>
- <property><name>hive_site_xml</name></property>
- <property><name>artifacts_directory</name></property>
- <property><name>workflow_file</name></property>
- <property><name>webrequest_table</name></property>
- <property><name>mark_directory_done_workflow_file</name></property>
- <property><name>temporary_directory</name></property>
- <property><name>aspect_tsv_archive_directory</name></property>
- <property><name>archive_job_output_workflow_file</name></property>
- <property><name>aspect_name</name></property>
- </parameters>
-
- <controls>
- <!--
- By having materialized jobs not timeout, we ease backfilling incidents
- after recoverable hiccups on the dataset producers.
- -->
- <timeout>-1</timeout>
-
- <!--
- Since the job only runs daily, even low concurrency allows to catch up
- pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
- process quite some data.
- -->
- <concurrency>1</concurrency>
-
- <!--
- In order to keep backfilling after an incident simple, we only start
- throttling materialization after 4 days.
- Due to the low concurrency, and low discrepancy between progressing
- time, and expected availability of datasets, we should typically have
- far less materialized jobs.
- -->
- <throttle>4</throttle>
- </controls>
-
- <datasets>
- <include>${webrequest_datasets_file}</include>
- </datasets>
-
- <input-events>
-
- <data-in name="webrequest_misc" dataset="webrequest_misc">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_mobile" dataset="webrequest_mobile">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_text" dataset="webrequest_text">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <!--
- The following dataset is not required as input to the
- workflow, but only helps to delay running it.
-
- The 24 hours offset is for a full day. And we subtract 2 hours, as
- webrequest processing starts 2 hours after the respective hour.
- -->
- <data-in name="delay" dataset="webrequest_text">
- <instance>${coord:current(24-2+hour_offset)}</instance>
- </data-in>
- </input-events>
-
- <action>
- <workflow>
- <app-path>${workflow_file}</app-path>
- <configuration>
-
- <!-- Pass these properties through to the workflow -->
- <property><name>name_node</name><value>${name_node}</value></property>
- <property><name>job_tracker</name><value>${job_tracker}</value></property>
- <property><name>queue_name</name><value>${queue_name}</value></property>
-
- <property>
- <name>hive_site_xml</name>
- <value>${hive_site_xml}</value>
- </property>
- <property>
- <name>artifacts_directory</name>
- <value>${artifacts_directory}</value>
- </property>
- <property>
- <name>webrequest_table</name>
- <value>${webrequest_table}</value>
- </property>
- <property>
- <name>year</name>
- <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
- </property>
- <property>
- <name>month</name>
- <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
- </property>
- <property>
- <name>day</name>
- <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
- </property>
- <property>
- <name>mark_directory_done_workflow_file</name>
- <value>${mark_directory_done_workflow_file}</value>
- </property>
- <property>
- <name>temporary_directory</name>
- <value>${temporary_directory}</value>
- </property>
- <property>
- <name>aspect_name</name>
- <value>${aspect_name}</value>
- </property>
- <property>
- <name>aspect_tsv_archive_directory</name>
- <value>${aspect_tsv_archive_directory}</value>
- </property>
- <property>
- <name>archive_job_output_workflow_file</name>
- <value>${archive_job_output_workflow_file}</value>
- </property>
- </configuration>
- </workflow>
- </action>
-</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
similarity index 98%
rename from oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
rename to oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
index 323fed6..6803d89 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_text.xml
@@ -57,7 +57,8 @@
</datasets>
<input-events>
- <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+
+ <data-in name="webrequest_misc" dataset="webrequest_misc">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
</data-in>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
b/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
deleted file mode 100644
index 3347b56..0000000
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+++ /dev/null
@@ -1,144 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="webrequest_legacy_tsvs-${aspect_name}-coord"
- frequency="${coord:days(1)}"
- start="${start_time}"
- end="${stop_time}"
- timezone="Universal">
-
- <parameters>
-
- <!-- Required properties -->
- <property><name>queue_name</name></property>
- <property><name>name_node</name></property>
- <property><name>job_tracker</name></property>
- <property><name>start_time</name></property>
- <property><name>hour_offset</name></property>
- <property><name>stop_time</name></property>
- <property><name>webrequest_datasets_file</name></property>
- <property><name>webrequest_data_directory</name></property>
- <property><name>hive_site_xml</name></property>
- <property><name>artifacts_directory</name></property>
- <property><name>workflow_file</name></property>
- <property><name>webrequest_table</name></property>
- <property><name>mark_directory_done_workflow_file</name></property>
- <property><name>temporary_directory</name></property>
- <property><name>aspect_tsv_archive_directory</name></property>
- <property><name>archive_job_output_workflow_file</name></property>
- <property><name>aspect_name</name></property>
- </parameters>
-
- <controls>
- <!--
- By having materialized jobs not timeout, we ease backfilling incidents
- after recoverable hiccups on the dataset producers.
- -->
- <timeout>-1</timeout>
-
- <!--
- Since the job only runs daily, even low concurrency allows to catch up
- pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
- process quite some data.
- -->
- <concurrency>1</concurrency>
-
- <!--
- In order to keep backfilling after an incident simple, we only start
- throttling materialization after 4 days.
- Due to the low concurrency, and low discrepancy between progressing
- time, and expected availability of datasets, we should typically have
- far less materialized jobs.
- -->
- <throttle>4</throttle>
- </controls>
-
- <datasets>
- <include>${webrequest_datasets_file}</include>
- </datasets>
-
- <input-events>
- <data-in name="webrequest_mobile" dataset="webrequest_mobile">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_text" dataset="webrequest_text">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_upload" dataset="webrequest_upload">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <!--
- The following dataset is not required as input to the
- workflow, but only helps to delay running it.
-
- The 24 hours offset is for a full day. And we subtract 2 hours, as
- webrequest processing starts 2 hours after the respective hour.
- -->
- <data-in name="delay" dataset="webrequest_text">
- <instance>${coord:current(24-2+hour_offset)}</instance>
- </data-in>
- </input-events>
-
- <action>
- <workflow>
- <app-path>${workflow_file}</app-path>
- <configuration>
-
- <!-- Pass these properties through to the workflow -->
- <property><name>name_node</name><value>${name_node}</value></property>
- <property><name>job_tracker</name><value>${job_tracker}</value></property>
- <property><name>queue_name</name><value>${queue_name}</value></property>
-
- <property>
- <name>hive_site_xml</name>
- <value>${hive_site_xml}</value>
- </property>
- <property>
- <name>artifacts_directory</name>
- <value>${artifacts_directory}</value>
- </property>
- <property>
- <name>webrequest_table</name>
- <value>${webrequest_table}</value>
- </property>
- <property>
- <name>year</name>
- <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
- </property>
- <property>
- <name>month</name>
- <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
- </property>
- <property>
- <name>day</name>
- <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
- </property>
- <property>
- <name>mark_directory_done_workflow_file</name>
- <value>${mark_directory_done_workflow_file}</value>
- </property>
- <property>
- <name>temporary_directory</name>
- <value>${temporary_directory}</value>
- </property>
- <property>
- <name>aspect_name</name>
- <value>${aspect_name}</value>
- </property>
- <property>
- <name>aspect_tsv_archive_directory</name>
- <value>${aspect_tsv_archive_directory}</value>
- </property>
- <property>
- <name>archive_job_output_workflow_file</name>
- <value>${archive_job_output_workflow_file}</value>
- </property>
- </configuration>
- </workflow>
- </action>
-</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
similarity index 98%
copy from oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
copy to oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
index 323fed6..988ba89 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_text_upload.xml
@@ -57,12 +57,13 @@
</datasets>
<input-events>
- <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+
+ <data-in name="webrequest_text" dataset="webrequest_text">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
</data-in>
- <data-in name="webrequest_text" dataset="webrequest_text">
+ <data-in name="webrequest_upload" dataset="webrequest_upload">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
</data-in>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
index 69d5651..35f971c 100644
--- a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -2,7 +2,7 @@
SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
--^ To work around HIVE-3296, we have SETs before any comments
--- Generates a TSV of server errors on the mobile, and text partitions
+-- Generates a TSV of server errors on the text partition
--
-- Parameters:
-- destination_directory -- Directory in HDFS where to store the generated
@@ -19,7 +19,7 @@
-- hive -f generate_5xx_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
--- -d webrequest_sources="'mobile'" \
+-- -d webrequest_sources="'text'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
index e054713..c31363a 100644
--- a/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_api-usage_tsv.hql
@@ -19,7 +19,7 @@
-- hive -f generate_api-usage_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
--- -d webrequest_sources="'mobile'" \
+-- -d webrequest_sources="'text'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
index 92592c1..ca82856 100644
--- a/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_sampled-1000_tsv.hql
@@ -2,7 +2,7 @@
SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
--^ To work around HIVE-3296, we have SETs before any comments
--- Generates a TSV for 1:1000 sampled requests for mobile, text, and upload
+-- Generates a TSV for 1:1000 sampled requests for text and upload
--
-- Parameters:
-- destination_directory -- Directory in HDFS where to store the generated
@@ -19,7 +19,7 @@
-- hive -f generate_sampled-1000_tsv.hql \
-- -d destination_directory=/tmp/foo \
-- -d webrequest_table=wmf_raw.webrequest \
--- -d webrequest_sources="'mobile'" \
+-- -d webrequest_sources="'text'" \
-- -d year=2014 \
-- -d month=4 \
-- -d day=1
diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml
b/oozie/webrequest/legacy_tsvs/workflow.xml
index ec91400..5bac9f6 100644
--- a/oozie/webrequest/legacy_tsvs/workflow.xml
+++ b/oozie/webrequest/legacy_tsvs/workflow.xml
@@ -120,11 +120,11 @@
This variable holds quoted strings, separated by commas. So for
example a coordinator path of
- /path/to/coordinator_misc_mobile.xml
+ /path/to/coordinator_misc_text.xml
will get turned into
- 'misc', 'mobile'
+ 'misc', 'text'
. This allows to use the variable straight in HivQL via
diff --git a/oozie/webrequest/load/bundle.xml b/oozie/webrequest/load/bundle.xml
index 17ac0fa..2075bfd 100644
--- a/oozie/webrequest/load/bundle.xml
+++ b/oozie/webrequest/load/bundle.xml
@@ -44,16 +44,6 @@
</configuration>
</coordinator>
- <coordinator name="load_webrequest-mobile-coord">
- <app-path>${coordinator_file}</app-path>
- <configuration>
- <property>
- <name>webrequest_source</name>
- <value>mobile</value>
- </property>
- </configuration>
- </coordinator>
-
<coordinator name="load_webrequest-text-coord">
<app-path>${coordinator_file}</app-path>
<configuration>
diff --git a/oozie/webrequest/refine/bundle.xml
b/oozie/webrequest/refine/bundle.xml
index 3cde34b..cbd017e 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -48,16 +48,6 @@
</configuration>
</coordinator>
- <coordinator name="refine_webrequest-mobile-coord">
- <app-path>${coordinator_file}</app-path>
- <configuration>
- <property>
- <name>webrequest_source</name>
- <value>mobile</value>
- </property>
- </configuration>
- </coordinator>
-
<coordinator name="refine_webrequest-text-coord">
<app-path>${coordinator_file}</app-path>
<configuration>
diff --git a/python/refinery/util.py b/python/refinery/util.py
index ce9cf79..63a1f95 100755
--- a/python/refinery/util.py
+++ b/python/refinery/util.py
@@ -267,10 +267,10 @@
Example:
partition_spec_from_path(
- path='/wmf/data/raw/webrequest/webrequest_mobile/hourly/2014/05/14/23',
+ path='/wmf/data/raw/webrequest/webrequest_text/hourly/2014/05/14/23',
regex=r'/webrequest_(?P<webrequest_source>[^/]+)/hourly/(?P<year>[^/]+)/(?P<month>[^/]+)/(?P<day>[^/]+)/(?P<hour>[^/]+)'
)
- returns: 'webrequest_source='mobile',year=2014,month=05,day=14,hour=23
+ returns: 'webrequest_source='text',year=2014,month=05,day=14,hour=23
"""
if isinstance(regex, basestring):
regex = re.compile(regex)
@@ -312,7 +312,7 @@
Example:
partition_datetime_from_spec(
- spec='webrequest_source=\'mobile\',year=2014,month=05,day=14,hour=00',
+ spec='webrequest_source=\'text\',year=2014,month=05,day=14,hour=00',
regex=r'webrequest_source=(?P<webrequest_source>[^/,]+)[/,]year=(?P<year>[^/,]+)[/,]month=(?P<month>[^/,]+)[/,]day=(?P<day>[^/]+)[/,]hour=(?P<hour>[^/,]+)'
)
returns: datetime.datetime(2014, 5, 14, 23, 0)
diff --git a/python/tests/test_refinery/test_util.py
b/python/tests/test_refinery/test_util.py
index 4b01381..b9e8e96 100644
--- a/python/tests/test_refinery/test_util.py
+++ b/python/tests/test_refinery/test_util.py
@@ -38,17 +38,17 @@
self.table_info = {
'table1': {
'location': '/path/to/table1',
- 'partitions_desc': ['webrequest_source=mobile/year=2013/month=10/day=01/hour=01', 'webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
- 'partitions_spec': ['webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=01', 'webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=02'],
+ 'partitions_desc': ['webrequest_source=text/year=2013/month=10/day=01/hour=01', 'webrequest_source=text/year=2013/month=10/day=01/hour=02'],
+ 'partitions_spec': ['webrequest_source=\'text\',year=2013,month=10,day=01,hour=01', 'webrequest_source=\'text\',year=2013,month=10,day=01,hour=02'],
'partitions_datetime': [datetime(2013,10,01,01), datetime(2013,10,01,02)],
- 'partitions_path': ['/path/to/table1/webrequest_mobile/hourly/2013/10/01/01', '/path/to/table1/webrequest_mobile/hourly/2013/10/01/02'],
+ 'partitions_path': ['/path/to/table1/webrequest_text/hourly/2013/10/01/01', '/path/to/table1/webrequest_text/hourly/2013/10/01/02'],
},
'table2': {
'location': '/path/to/table2',
- 'partitions_desc': ['webrequest_source=mobile/year=2013/month=10/day=01/hour=01', 'webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
- 'partitions_spec': ['webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=01', 'webrequest_source=\'mobile\',year=2013,month=10,day=01,hour=02'],
+ 'partitions_desc': ['webrequest_source=text/year=2013/month=10/day=01/hour=01', 'webrequest_source=text/year=2013/month=10/day=01/hour=02'],
+ 'partitions_spec': ['webrequest_source=\'text\',year=2013,month=10,day=01,hour=01', 'webrequest_source=\'text\',year=2013,month=10,day=01,hour=02'],
'partitions_datetime': [datetime(2013,10,01,01), datetime(2013,10,01,02)],
- 'partitions_path': ['/path/to/table1/webrequest_source=mobile/year=2013/month=10/day=01/hour=01', '/path/to/table2/webrequest_source=mobile/year=2013/month=10/day=01/hour=02'],
+ 'partitions_path': ['/path/to/table1/webrequest_source=text/year=2013/month=10/day=01/hour=01', '/path/to/table2/webrequest_source=text/year=2013/month=10/day=01/hour=02'],
},
}
--
To view, visit https://gerrit.wikimedia.org/r/264870
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I2c356adca2c4198d33b0c4c2eeb9d2df010e12cb
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: Elukey <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Madhuvishy <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits