Ottomata has submitted this change and it was merged.
Change subject: Use misc when producing legacy tsvs
......................................................................
Use misc when producing legacy tsvs
Change-Id: I083e5a9d4e08117ec585f1fe9058b1effd635a16
---
M diagrams/oozie-overview.dia
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
M oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
5 files changed, 146 insertions(+), 3 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 2944843..1da1b88 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties
b/oozie/webrequest/legacy_tsvs/bundle.properties
index 06b5c73..900fbd3 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -21,6 +21,7 @@
# webrequest_sources they depend on. This allows us, for example, to turn off
# upload and have the coordinators that depend on upload block, while the
# coordinators that do not depend on upload continue to run.
+coordinator_misc_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
coordinator_mobile_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml
coordinator_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
coordinator_mobile_text_upload_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 7bff053..97979ad 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -9,6 +9,7 @@
</property>
<!-- Required properties. -->
+ <property><name>coordinator_misc_mobile_text_file</name></property>
<property><name>coordinator_mobile_file</name></property>
<property><name>coordinator_mobile_text_file</name></property>
<property><name>coordinator_mobile_text_upload_file</name></property>
@@ -137,7 +138,7 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-5xx">
- <app-path>${coordinator_mobile_text_file}</app-path>
+ <app-path>${coordinator_misc_mobile_text_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
new file mode 100644
index 0000000..56fbb84
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+ name="webrequest_legacy_tsvs-${aspect_name}-coord"
+ frequency="${coord:days(1)}"
+ start="${start_time}"
+ end="${stop_time}"
+ timezone="Universal">
+
+ <parameters>
+ <property>
+ <name>queue_name</name>
+ <value>default</value>
+ </property>
+
+ <!-- Required properties. -->
+ <property><name>name_node</name></property>
+ <property><name>job_tracker</name></property>
+ <property><name>start_time</name></property>
+ <property><name>hour_offset</name></property>
+ <property><name>stop_time</name></property>
+ <property><name>webrequest_datasets_file</name></property>
+ <property><name>webrequest_data_directory</name></property>
+ <property><name>hive_site_xml</name></property>
+ <property><name>workflow_file</name></property>
+ <property><name>webrequest_table</name></property>
+ <property><name>mark_directory_done_workflow_file</name></property>
+ <property><name>temporary_directory</name></property>
+ <property><name>aspect_tsv_archive_directory</name></property>
+ <property><name>archive_job_output_workflow_file</name></property>
+ <property><name>aspect_name</name></property>
+ </parameters>
+
+ <controls>
+ <!--
+ By having materialized jobs not time out, we ease backfilling incidents
+ after recoverable hiccups on the dataset producers.
+ -->
+ <timeout>-1</timeout>
+
+ <!--
+ Since the job only runs daily, even low concurrency allows catching up
+ pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
+ process quite a lot of data.
+ -->
+ <concurrency>1</concurrency>
+
+ <!--
+ In order to keep backfilling after an incident simple, we only start
+ throttling materialization after 4 days.
+ Due to the low concurrency and the low discrepancy between progressing
+ time and the expected availability of datasets, we should typically have
+ far fewer materialized jobs.
+ -->
+ <throttle>4</throttle>
+ </controls>
+
+ <datasets>
+ <include>${webrequest_datasets_file}</include>
+ </datasets>
+
+ <input-events>
+ <data-in name="webrequest_misc" dataset="webrequest_misc">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(23)}</end-instance>
+ </data-in>
+
+ <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(23)}</end-instance>
+ </data-in>
+
+ <data-in name="webrequest_text" dataset="webrequest_text">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(23)}</end-instance>
+ </data-in>
+
+ <!--
+ The following dataset is not required as input to the
+ workflow, but only helps to delay running it.
+
+ The 24 hours offset is for a full day. And we subtract 2 hours, as
+ webrequest processing starts 2 hours after the respective hour.
+ -->
+ <data-in name="delay" dataset="webrequest_text">
+ <instance>${coord:current(24-2+hour_offset)}</instance>
+ </data-in>
+ </input-events>
+
+ <action>
+ <workflow>
+ <app-path>${workflow_file}</app-path>
+ <configuration>
+
+ <!-- Pass these properties through to the workflow -->
+
<property><name>name_node</name><value>${name_node}</value></property>
+
<property><name>job_tracker</name><value>${job_tracker}</value></property>
+
<property><name>queue_name</name><value>${queue_name}</value></property>
+
+ <property>
+ <name>hive_site_xml</name>
+ <value>${hive_site_xml}</value>
+ </property>
+ <property>
+ <name>webrequest_table</name>
+ <value>${webrequest_table}</value>
+ </property>
+ <property>
+ <name>year</name>
+ <value>${coord:formatTime(coord:nominalTime(),
"yyyy")}</value>
+ </property>
+ <property>
+ <name>month</name>
+ <value>${coord:formatTime(coord:nominalTime(),
"MM")}</value>
+ </property>
+ <property>
+ <name>day</name>
+ <value>${coord:formatTime(coord:nominalTime(),
"dd")}</value>
+ </property>
+ <property>
+ <name>mark_directory_done_workflow_file</name>
+ <value>${mark_directory_done_workflow_file}</value>
+ </property>
+ <property>
+ <name>temporary_directory</name>
+ <value>${temporary_directory}</value>
+ </property>
+ <property>
+ <name>aspect_name</name>
+ <value>${aspect_name}</value>
+ </property>
+ <property>
+ <name>aspect_tsv_archive_directory</name>
+ <value>${aspect_tsv_archive_directory}</value>
+ </property>
+ <property>
+ <name>archive_job_output_workflow_file</name>
+ <value>${archive_job_output_workflow_file}</value>
+ </property>
+ </configuration>
+ </workflow>
+ </action>
+</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
index 2990dee..ecbcadf 100644
--- a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -56,9 +56,8 @@
) line,
dt
FROM ${webrequest_table}
- WHERE webrequest_source IN ('mobile', 'text')
+ WHERE webrequest_source IN ('misc', 'mobile', 'text')
-- TODO: Add 'bits', once it's turned on again
- -- TODO: Add 'misc', once it's available
AND year=${year}
AND month=${month}
AND day=${day}
--
To view, visit https://gerrit.wikimedia.org/r/186776
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I083e5a9d4e08117ec585f1fe9058b1effd635a16
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits