Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/189219

to review the following change.

Change subject: Add per webrequest_source 5xx tsvs to legacy_tsvs
......................................................................

Add per webrequest_source 5xx tsvs to legacy_tsvs

Ops called out that they do not want to see results from the misc
cluster when grepping for production issues, and it was probably an
oversight to have them in the udp2log tsvs. But they also voiced that,
in general, having 5xx for misc might be useful.

As filtering tsvs by webrequest_source is generally hard, we split all
of the sources out into separate files. That way, Ops can grep for
production issues without having to deal with misc. For example, they
can easily focus on only text or only mobile. And they also get
ready-made (separate) files for misc's 5xx.

Change-Id: Id2adfa9ddb60789210a36b5472878890de580ace
---
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/coordinator_misc.xml
A oozie/webrequest/legacy_tsvs/coordinator_text.xml
A oozie/webrequest/legacy_tsvs/coordinator_upload.xml
A oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql
A oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql
A oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql
A oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql
9 files changed, 495 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/19/189219/1

diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties
index 85eef77..ed9128d 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -32,10 +32,13 @@
 # webrequest_sources they depend on. This allows to for example turn off upload
 # and have the coordinators that depend on upload block, while the coordinators
 # that do not depend on upload continue to run.
+coordinator_misc_file               = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml
 coordinator_misc_mobile_text_file   = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
 coordinator_mobile_file             = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml
 coordinator_mobile_text_file        = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
 coordinator_mobile_text_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+coordinator_text_file               = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml
+coordinator_upload_file             = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml
 
 # HDFS path to workflow to run.
 workflow_file                       = ${oozie_directory}/webrequest/legacy_tsvs/workflow.xml
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml b/oozie/webrequest/legacy_tsvs/bundle.xml
index d179401..7d65232 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -9,10 +9,13 @@
         </property>
 
         <!-- Required properties. -->
+        <property><name>coordinator_misc_file</name></property>
         <property><name>coordinator_misc_mobile_text_file</name></property>
         <property><name>coordinator_mobile_file</name></property>
         <property><name>coordinator_mobile_text_file</name></property>
         <property><name>coordinator_mobile_text_upload_file</name></property>
+        <property><name>coordinator_text_file</name></property>
+        <property><name>coordinator_upload_file</name></property>
         <property><name>name_node</name></property>
         <property><name>job_tracker</name></property>
         <property><name>start_time</name></property>
@@ -159,4 +162,78 @@
             </property>
         </configuration>
     </coordinator>
+
+    <!-- TODO: Add 'bits' per-source 5xx variant, once it's turned on again -->
+
+    <coordinator name="webrequest_legacy_tsvs-5xx-misc">
+        <app-path>${coordinator_misc_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>5xx-misc</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/5xx-misc</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>9</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator name="webrequest_legacy_tsvs-5xx-mobile">
+        <app-path>${coordinator_mobile_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>5xx-mobile</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/5xx-mobile</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>10</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator name="webrequest_legacy_tsvs-5xx-text">
+        <app-path>${coordinator_text_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>5xx-text</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/5xx-text</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>11</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator name="webrequest_legacy_tsvs-5xx-upload">
+        <app-path>${coordinator_upload_file}</app-path>
+        <configuration>
+            <property>
+                <name>aspect_name</name>
+                <value>5xx-upload</value>
+            </property>
+            <property>
+                <name>aspect_tsv_archive_directory</name>
+                <value>${webrequest_archive_directory}/5xx-upload</value>
+            </property>
+            <property>
+                <name>hour_offset</name>
+                <value>12</value>
+            </property>
+        </configuration>
+    </coordinator>
 </bundle-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc.xml
new file mode 100644
index 0000000..301ae05
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="webrequest_legacy_tsvs-${aspect_name}-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <property>
+            <name>queue_name</name>
+            <value>default</value>
+        </property>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>start_time</name></property>
+        <property><name>hour_offset</name></property>
+        <property><name>stop_time</name></property>
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>temporary_directory</name></property>
+        <property><name>aspect_tsv_archive_directory</name></property>
+        <property><name>archive_job_output_workflow_file</name></property>
+        <property><name>aspect_name</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        Since the job only runs daily, even low concurrency allows to catch up
+        pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
+        process quite some data.
+        -->
+        <concurrency>1</concurrency>
+
+        <!--
+        In order to keep backfilling after an incident simple, we only start
+        throttling materialization after 4 days.
+        Due to the low concurrency, and low discrepancy between progressing
+        time, and expected availability of datasets, we should typically have
+        far less materialized jobs.
+        -->
+        <throttle>4</throttle>
+    </controls>
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="webrequest_misc" dataset="webrequest_misc">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <!--
+        The following instance is not required as input to the workflow, but
+        only helps to delay running it. It waits on this coordinator's own
+        source, so the coordinator does not block when other sources are off.
+        The 24 hours offset is for a full day. And we subtract 2 hours, as
+        webrequest processing starts 2 hours after the respective hour.
+        -->
+        <data-in name="delay" dataset="webrequest_misc">
+            <instance>${coord:current(24-2+hour_offset)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <!-- Pass these properties through to the workflow -->
+                <property><name>name_node</name><value>${name_node}</value></property>
+                <property><name>job_tracker</name><value>${job_tracker}</value></property>
+                <property><name>queue_name</name><value>${queue_name}</value></property>
+
+                <property>
+                    <name>hive_site_xml</name>
+                    <value>${hive_site_xml}</value>
+                </property>
+                <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
+                    <name>webrequest_table</name>
+                    <value>${webrequest_table}</value>
+                </property>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
+                </property>
+                <property>
+                    <name>mark_directory_done_workflow_file</name>
+                    <value>${mark_directory_done_workflow_file}</value>
+                </property>
+                <property>
+                    <name>temporary_directory</name>
+                    <value>${temporary_directory}</value>
+                </property>
+                <property>
+                    <name>aspect_name</name>
+                    <value>${aspect_name}</value>
+                </property>
+                <property>
+                    <name>aspect_tsv_archive_directory</name>
+                    <value>${aspect_tsv_archive_directory}</value>
+                </property>
+                <property>
+                    <name>archive_job_output_workflow_file</name>
+                    <value>${archive_job_output_workflow_file}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_text.xml
new file mode 100644
index 0000000..19bd118
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/coordinator_text.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="webrequest_legacy_tsvs-${aspect_name}-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <property>
+            <name>queue_name</name>
+            <value>default</value>
+        </property>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>start_time</name></property>
+        <property><name>hour_offset</name></property>
+        <property><name>stop_time</name></property>
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>temporary_directory</name></property>
+        <property><name>aspect_tsv_archive_directory</name></property>
+        <property><name>archive_job_output_workflow_file</name></property>
+        <property><name>aspect_name</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        Since the job only runs daily, even low concurrency allows to catch up
+        pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
+        process quite some data.
+        -->
+        <concurrency>1</concurrency>
+
+        <!--
+        In order to keep backfilling after an incident simple, we only start
+        throttling materialization after 4 days.
+        Due to the low concurrency, and low discrepancy between progressing
+        time, and expected availability of datasets, we should typically have
+        far less materialized jobs.
+        -->
+        <throttle>4</throttle>
+    </controls>
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="webrequest_text" dataset="webrequest_text">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <!--
+        The following dataset is not required as input to the
+        workflow, but only helps to delay running it.
+
+        The 24 hours offset is for a full day. And we subtract 2 hours, as
+        webrequest processing starts 2 hours after the respective hour.
+        -->
+        <data-in name="delay" dataset="webrequest_text">
+            <instance>${coord:current(24-2+hour_offset)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <!-- Pass these properties through to the workflow -->
+                <property><name>name_node</name><value>${name_node}</value></property>
+                <property><name>job_tracker</name><value>${job_tracker}</value></property>
+                <property><name>queue_name</name><value>${queue_name}</value></property>
+
+                <property>
+                    <name>hive_site_xml</name>
+                    <value>${hive_site_xml}</value>
+                </property>
+                <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
+                    <name>webrequest_table</name>
+                    <value>${webrequest_table}</value>
+                </property>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
+                </property>
+                <property>
+                    <name>mark_directory_done_workflow_file</name>
+                    <value>${mark_directory_done_workflow_file}</value>
+                </property>
+                <property>
+                    <name>temporary_directory</name>
+                    <value>${temporary_directory}</value>
+                </property>
+                <property>
+                    <name>aspect_name</name>
+                    <value>${aspect_name}</value>
+                </property>
+                <property>
+                    <name>aspect_tsv_archive_directory</name>
+                    <value>${aspect_tsv_archive_directory}</value>
+                </property>
+                <property>
+                    <name>archive_job_output_workflow_file</name>
+                    <value>${archive_job_output_workflow_file}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_upload.xml b/oozie/webrequest/legacy_tsvs/coordinator_upload.xml
new file mode 100644
index 0000000..ba6d94a
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/coordinator_upload.xml
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="webrequest_legacy_tsvs-${aspect_name}-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <property>
+            <name>queue_name</name>
+            <value>default</value>
+        </property>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>start_time</name></property>
+        <property><name>hour_offset</name></property>
+        <property><name>stop_time</name></property>
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>hive_site_xml</name></property>
+        <property><name>artifacts_directory</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>temporary_directory</name></property>
+        <property><name>aspect_tsv_archive_directory</name></property>
+        <property><name>archive_job_output_workflow_file</name></property>
+        <property><name>aspect_name</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        Since the job only runs daily, even low concurrency allows to catch up
+        pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
+        process quite some data.
+        -->
+        <concurrency>1</concurrency>
+
+        <!--
+        In order to keep backfilling after an incident simple, we only start
+        throttling materialization after 4 days.
+        Due to the low concurrency, and low discrepancy between progressing
+        time, and expected availability of datasets, we should typically have
+        far less materialized jobs.
+        -->
+        <throttle>4</throttle>
+    </controls>
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="webrequest_upload" dataset="webrequest_upload">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <!--
+        The following instance is not required as input to the workflow, but
+        only helps to delay running it. It waits on this coordinator's own
+        source, so the coordinator does not block when other sources are off.
+        The 24 hours offset is for a full day. And we subtract 2 hours, as
+        webrequest processing starts 2 hours after the respective hour.
+        -->
+        <data-in name="delay" dataset="webrequest_upload">
+            <instance>${coord:current(24-2+hour_offset)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <!-- Pass these properties through to the workflow -->
+                <property><name>name_node</name><value>${name_node}</value></property>
+                <property><name>job_tracker</name><value>${job_tracker}</value></property>
+                <property><name>queue_name</name><value>${queue_name}</value></property>
+
+                <property>
+                    <name>hive_site_xml</name>
+                    <value>${hive_site_xml}</value>
+                </property>
+                <property>
+                    <name>artifacts_directory</name>
+                    <value>${artifacts_directory}</value>
+                </property>
+                <property>
+                    <name>webrequest_table</name>
+                    <value>${webrequest_table}</value>
+                </property>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
+                </property>
+                <property>
+                    <name>mark_directory_done_workflow_file</name>
+                    <value>${mark_directory_done_workflow_file}</value>
+                </property>
+                <property>
+                    <name>temporary_directory</name>
+                    <value>${temporary_directory}</value>
+                </property>
+                <property>
+                    <name>aspect_name</name>
+                    <value>${aspect_name}</value>
+                </property>
+                <property>
+                    <name>aspect_tsv_archive_directory</name>
+                    <value>${aspect_tsv_archive_directory}</value>
+                </property>
+                <property>
+                    <name>archive_job_output_workflow_file</name>
+                    <value>${archive_job_output_workflow_file}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql
new file mode 120000
index 0000000..a8f4179
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql
@@ -0,0 +1 @@
+generate_5xx_tsv.hql
\ No newline at end of file
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql
new file mode 120000
index 0000000..a8f4179
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql
@@ -0,0 +1 @@
+generate_5xx_tsv.hql
\ No newline at end of file
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql
new file mode 120000
index 0000000..a8f4179
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql
@@ -0,0 +1 @@
+generate_5xx_tsv.hql
\ No newline at end of file
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql
new file mode 120000
index 0000000..a8f4179
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql
@@ -0,0 +1 @@
+generate_5xx_tsv.hql
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/189219
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id2adfa9ddb60789210a36b5472878890de580ace
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <christ...@quelltextlich.at>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to