Hello Ottomata, I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/189219 to review the following change. Change subject: Add per webrequest_source 5xx tsvs to legacy_tsvs ...................................................................... Add per webrequest_source 5xx tsvs to legacy_tsvs Ops called out that they do not want to see results from the misc cluster when grepping for production issues, and that having them in the udp2log tsvs was probably an oversight. But they also voiced that having 5xx for misc might generally be useful. As filtering tsvs by their webrequest_sources is generally hard, we split all of them out into separate files. That way, Ops can grep for production issues without having to deal with misc. For example, they can easily focus on only text or only mobile. And they also get prepared (separate) files for misc's 5xx. Change-Id: Id2adfa9ddb60789210a36b5472878890de580ace --- M oozie/webrequest/legacy_tsvs/bundle.properties M oozie/webrequest/legacy_tsvs/bundle.xml A oozie/webrequest/legacy_tsvs/coordinator_misc.xml A oozie/webrequest/legacy_tsvs/coordinator_text.xml A oozie/webrequest/legacy_tsvs/coordinator_upload.xml A oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql A oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql A oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql A oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql 9 files changed, 495 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/19/189219/1 diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties index 85eef77..ed9128d 100644 --- a/oozie/webrequest/legacy_tsvs/bundle.properties +++ b/oozie/webrequest/legacy_tsvs/bundle.properties @@ -32,10 +32,13 @@ # webrequest_sources they depend on. This allows to for example turn off upload # and have the coordinators that depend on upload block, while the coordinators # that do not depend on upload continue to run. 
+coordinator_misc_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml coordinator_misc_mobile_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml coordinator_mobile_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml coordinator_mobile_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml coordinator_mobile_text_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml +coordinator_text_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml +coordinator_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml # HDFS path to workflow to run. workflow_file = ${oozie_directory}/webrequest/legacy_tsvs/workflow.xml diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml b/oozie/webrequest/legacy_tsvs/bundle.xml index d179401..7d65232 100644 --- a/oozie/webrequest/legacy_tsvs/bundle.xml +++ b/oozie/webrequest/legacy_tsvs/bundle.xml @@ -9,10 +9,13 @@ </property> <!-- Required properties. 
--> + <property><name>coordinator_misc_file</name></property> <property><name>coordinator_misc_mobile_text_file</name></property> <property><name>coordinator_mobile_file</name></property> <property><name>coordinator_mobile_text_file</name></property> <property><name>coordinator_mobile_text_upload_file</name></property> + <property><name>coordinator_text_file</name></property> + <property><name>coordinator_upload_file</name></property> <property><name>name_node</name></property> <property><name>job_tracker</name></property> <property><name>start_time</name></property> @@ -159,4 +162,78 @@ </property> </configuration> </coordinator> + + <!-- TODO: Add 'bits' per-source 5xx variant, once it's turned on again --> + + <coordinator name="webrequest_legacy_tsvs-5xx-misc"> + <app-path>${coordinator_misc_file}</app-path> + <configuration> + <property> + <name>aspect_name</name> + <value>5xx-misc</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${webrequest_archive_directory}/5xx-misc</value> + </property> + <property> + <name>hour_offset</name> + <value>9</value> + </property> + </configuration> + </coordinator> + + <coordinator name="webrequest_legacy_tsvs-5xx-mobile"> + <app-path>${coordinator_mobile_file}</app-path> + <configuration> + <property> + <name>aspect_name</name> + <value>5xx-mobile</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${webrequest_archive_directory}/5xx-mobile</value> + </property> + <property> + <name>hour_offset</name> + <value>10</value> + </property> + </configuration> + </coordinator> + + <coordinator name="webrequest_legacy_tsvs-5xx-text"> + <app-path>${coordinator_text_file}</app-path> + <configuration> + <property> + <name>aspect_name</name> + <value>5xx-text</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${webrequest_archive_directory}/5xx-text</value> + </property> + <property> + <name>hour_offset</name> + 
<value>11</value> + </property> + </configuration> + </coordinator> + + <coordinator name="webrequest_legacy_tsvs-5xx-upload"> + <app-path>${coordinator_upload_file}</app-path> + <configuration> + <property> + <name>aspect_name</name> + <value>5xx-upload</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${webrequest_archive_directory}/5xx-upload</value> + </property> + <property> + <name>hour_offset</name> + <value>12</value> + </property> + </configuration> + </coordinator> </bundle-app> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc.xml new file mode 100644 index 0000000..301ae05 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/coordinator_misc.xml @@ -0,0 +1,137 @@ +<?xml version="1.0" encoding="UTF-8"?> +<coordinator-app xmlns="uri:oozie:coordinator:0.4" + name="webrequest_legacy_tsvs-${aspect_name}-coord" + frequency="${coord:days(1)}" + start="${start_time}" + end="${stop_time}" + timezone="Universal"> + + <parameters> + <property> + <name>queue_name</name> + <value>default</value> + </property> + + <!-- Required properties. 
--> + <property><name>name_node</name></property> + <property><name>job_tracker</name></property> + <property><name>start_time</name></property> + <property><name>hour_offset</name></property> + <property><name>stop_time</name></property> + <property><name>webrequest_datasets_file</name></property> + <property><name>webrequest_data_directory</name></property> + <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> + <property><name>workflow_file</name></property> + <property><name>webrequest_table</name></property> + <property><name>mark_directory_done_workflow_file</name></property> + <property><name>temporary_directory</name></property> + <property><name>aspect_tsv_archive_directory</name></property> + <property><name>archive_job_output_workflow_file</name></property> + <property><name>aspect_name</name></property> + </parameters> + + <controls> + <!-- + By having materialized jobs not timeout, we ease backfilling incidents + after recoverable hiccups on the dataset producers. + --> + <timeout>-1</timeout> + + <!-- + Since the job only runs daily, even low concurrency allows to catch up + pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically + process quite some data. + --> + <concurrency>1</concurrency> + + <!-- + In order to keep backfilling after an incident simple, we only start + throttling materialization after 4 days. + Due to the low concurrency, and low discrepancy between progressing + time, and expected availability of datasets, we should typically have + far less materialized jobs. 
+ --> + <throttle>4</throttle> + </controls> + + <datasets> + <include>${webrequest_datasets_file}</include> + </datasets> + + <input-events> + <data-in name="webrequest_misc" dataset="webrequest_misc"> + <start-instance>${coord:current(0)}</start-instance> + <end-instance>${coord:current(23)}</end-instance> + </data-in> + + <!-- + The following dataset is not required as input to the + workflow, but only helps to delay running it. + + The 24 hours offset is for a full day. And we subtract 2 hours, as + webrequest processing starts 2 hours after the respective hour. + --> + <data-in name="delay" dataset="webrequest_text"> + <instance>${coord:current(24-2+hour_offset)}</instance> + </data-in> + </input-events> + + <action> + <workflow> + <app-path>${workflow_file}</app-path> + <configuration> + + <!-- Pass these properties through to the workflow --> + <property><name>name_node</name><value>${name_node}</value></property> + <property><name>job_tracker</name><value>${job_tracker}</value></property> + <property><name>queue_name</name><value>${queue_name}</value></property> + + <property> + <name>hive_site_xml</name> + <value>${hive_site_xml}</value> + </property> + <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> + <name>webrequest_table</name> + <value>${webrequest_table}</value> + </property> + <property> + <name>year</name> + <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value> + </property> + <property> + <name>month</name> + <value>${coord:formatTime(coord:nominalTime(), "MM")}</value> + </property> + <property> + <name>day</name> + <value>${coord:formatTime(coord:nominalTime(), "dd")}</value> + </property> + <property> + <name>mark_directory_done_workflow_file</name> + <value>${mark_directory_done_workflow_file}</value> + </property> + <property> + <name>temporary_directory</name> + <value>${temporary_directory}</value> + </property> + <property> + <name>aspect_name</name> + 
<value>${aspect_name}</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${aspect_tsv_archive_directory}</value> + </property> + <property> + <name>archive_job_output_workflow_file</name> + <value>${archive_job_output_workflow_file}</value> + </property> + </configuration> + </workflow> + </action> +</coordinator-app> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_text.xml new file mode 100644 index 0000000..19bd118 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/coordinator_text.xml @@ -0,0 +1,137 @@ +<?xml version="1.0" encoding="UTF-8"?> +<coordinator-app xmlns="uri:oozie:coordinator:0.4" + name="webrequest_legacy_tsvs-${aspect_name}-coord" + frequency="${coord:days(1)}" + start="${start_time}" + end="${stop_time}" + timezone="Universal"> + + <parameters> + <property> + <name>queue_name</name> + <value>default</value> + </property> + + <!-- Required properties. --> + <property><name>name_node</name></property> + <property><name>job_tracker</name></property> + <property><name>start_time</name></property> + <property><name>hour_offset</name></property> + <property><name>stop_time</name></property> + <property><name>webrequest_datasets_file</name></property> + <property><name>webrequest_data_directory</name></property> + <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> + <property><name>workflow_file</name></property> + <property><name>webrequest_table</name></property> + <property><name>mark_directory_done_workflow_file</name></property> + <property><name>temporary_directory</name></property> + <property><name>aspect_tsv_archive_directory</name></property> + <property><name>archive_job_output_workflow_file</name></property> + <property><name>aspect_name</name></property> + </parameters> + + <controls> + <!-- + By having materialized jobs not timeout, we ease backfilling incidents + after recoverable hiccups on the 
dataset producers. + --> + <timeout>-1</timeout> + + <!-- + Since the job only runs daily, even low concurrency allows to catch up + pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically + process quite some data. + --> + <concurrency>1</concurrency> + + <!-- + In order to keep backfilling after an incident simple, we only start + throttling materialization after 4 days. + Due to the low concurrency, and low discrepancy between progressing + time, and expected availability of datasets, we should typically have + far less materialized jobs. + --> + <throttle>4</throttle> + </controls> + + <datasets> + <include>${webrequest_datasets_file}</include> + </datasets> + + <input-events> + <data-in name="webrequest_text" dataset="webrequest_text"> + <start-instance>${coord:current(0)}</start-instance> + <end-instance>${coord:current(23)}</end-instance> + </data-in> + + <!-- + The following dataset is not required as input to the + workflow, but only helps to delay running it. + + The 24 hours offset is for a full day. And we subtract 2 hours, as + webrequest processing starts 2 hours after the respective hour. 
+ --> + <data-in name="delay" dataset="webrequest_text"> + <instance>${coord:current(24-2+hour_offset)}</instance> + </data-in> + </input-events> + + <action> + <workflow> + <app-path>${workflow_file}</app-path> + <configuration> + + <!-- Pass these properties through to the workflow --> + <property><name>name_node</name><value>${name_node}</value></property> + <property><name>job_tracker</name><value>${job_tracker}</value></property> + <property><name>queue_name</name><value>${queue_name}</value></property> + + <property> + <name>hive_site_xml</name> + <value>${hive_site_xml}</value> + </property> + <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> + <name>webrequest_table</name> + <value>${webrequest_table}</value> + </property> + <property> + <name>year</name> + <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value> + </property> + <property> + <name>month</name> + <value>${coord:formatTime(coord:nominalTime(), "MM")}</value> + </property> + <property> + <name>day</name> + <value>${coord:formatTime(coord:nominalTime(), "dd")}</value> + </property> + <property> + <name>mark_directory_done_workflow_file</name> + <value>${mark_directory_done_workflow_file}</value> + </property> + <property> + <name>temporary_directory</name> + <value>${temporary_directory}</value> + </property> + <property> + <name>aspect_name</name> + <value>${aspect_name}</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${aspect_tsv_archive_directory}</value> + </property> + <property> + <name>archive_job_output_workflow_file</name> + <value>${archive_job_output_workflow_file}</value> + </property> + </configuration> + </workflow> + </action> +</coordinator-app> diff --git a/oozie/webrequest/legacy_tsvs/coordinator_upload.xml b/oozie/webrequest/legacy_tsvs/coordinator_upload.xml new file mode 100644 index 0000000..ba6d94a --- /dev/null +++ 
b/oozie/webrequest/legacy_tsvs/coordinator_upload.xml @@ -0,0 +1,137 @@ +<?xml version="1.0" encoding="UTF-8"?> +<coordinator-app xmlns="uri:oozie:coordinator:0.4" + name="webrequest_legacy_tsvs-${aspect_name}-coord" + frequency="${coord:days(1)}" + start="${start_time}" + end="${stop_time}" + timezone="Universal"> + + <parameters> + <property> + <name>queue_name</name> + <value>default</value> + </property> + + <!-- Required properties. --> + <property><name>name_node</name></property> + <property><name>job_tracker</name></property> + <property><name>start_time</name></property> + <property><name>hour_offset</name></property> + <property><name>stop_time</name></property> + <property><name>webrequest_datasets_file</name></property> + <property><name>webrequest_data_directory</name></property> + <property><name>hive_site_xml</name></property> + <property><name>artifacts_directory</name></property> + <property><name>workflow_file</name></property> + <property><name>webrequest_table</name></property> + <property><name>mark_directory_done_workflow_file</name></property> + <property><name>temporary_directory</name></property> + <property><name>aspect_tsv_archive_directory</name></property> + <property><name>archive_job_output_workflow_file</name></property> + <property><name>aspect_name</name></property> + </parameters> + + <controls> + <!-- + By having materialized jobs not timeout, we ease backfilling incidents + after recoverable hiccups on the dataset producers. + --> + <timeout>-1</timeout> + + <!-- + Since the job only runs daily, even low concurrency allows to catch up + pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically + process quite some data. + --> + <concurrency>1</concurrency> + + <!-- + In order to keep backfilling after an incident simple, we only start + throttling materialization after 4 days. 
+ Due to the low concurrency, and low discrepancy between progressing + time, and expected availability of datasets, we should typically have + far less materialized jobs. + --> + <throttle>4</throttle> + </controls> + + <datasets> + <include>${webrequest_datasets_file}</include> + </datasets> + + <input-events> + <data-in name="webrequest_upload" dataset="webrequest_upload"> + <start-instance>${coord:current(0)}</start-instance> + <end-instance>${coord:current(23)}</end-instance> + </data-in> + + <!-- + The following dataset is not required as input to the + workflow, but only helps to delay running it. + + The 24 hours offset is for a full day. And we subtract 2 hours, as + webrequest processing starts 2 hours after the respective hour. + --> + <data-in name="delay" dataset="webrequest_text"> + <instance>${coord:current(24-2+hour_offset)}</instance> + </data-in> + </input-events> + + <action> + <workflow> + <app-path>${workflow_file}</app-path> + <configuration> + + <!-- Pass these properties through to the workflow --> + <property><name>name_node</name><value>${name_node}</value></property> + <property><name>job_tracker</name><value>${job_tracker}</value></property> + <property><name>queue_name</name><value>${queue_name}</value></property> + + <property> + <name>hive_site_xml</name> + <value>${hive_site_xml}</value> + </property> + <property> + <name>artifacts_directory</name> + <value>${artifacts_directory}</value> + </property> + <property> + <name>webrequest_table</name> + <value>${webrequest_table}</value> + </property> + <property> + <name>year</name> + <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value> + </property> + <property> + <name>month</name> + <value>${coord:formatTime(coord:nominalTime(), "MM")}</value> + </property> + <property> + <name>day</name> + <value>${coord:formatTime(coord:nominalTime(), "dd")}</value> + </property> + <property> + <name>mark_directory_done_workflow_file</name> + 
<value>${mark_directory_done_workflow_file}</value> + </property> + <property> + <name>temporary_directory</name> + <value>${temporary_directory}</value> + </property> + <property> + <name>aspect_name</name> + <value>${aspect_name}</value> + </property> + <property> + <name>aspect_tsv_archive_directory</name> + <value>${aspect_tsv_archive_directory}</value> + </property> + <property> + <name>archive_job_output_workflow_file</name> + <value>${archive_job_output_workflow_file}</value> + </property> + </configuration> + </workflow> + </action> +</coordinator-app> diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql new file mode 120000 index 0000000..a8f4179 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/generate_5xx-misc_tsv.hql @@ -0,0 +1 @@ +generate_5xx_tsv.hql \ No newline at end of file diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql new file mode 120000 index 0000000..a8f4179 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/generate_5xx-mobile_tsv.hql @@ -0,0 +1 @@ +generate_5xx_tsv.hql \ No newline at end of file diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql new file mode 120000 index 0000000..a8f4179 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/generate_5xx-text_tsv.hql @@ -0,0 +1 @@ +generate_5xx_tsv.hql \ No newline at end of file diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql new file mode 120000 index 0000000..a8f4179 --- /dev/null +++ b/oozie/webrequest/legacy_tsvs/generate_5xx-upload_tsv.hql @@ -0,0 +1 @@ +generate_5xx_tsv.hql \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/189219 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: 
Id2adfa9ddb60789210a36b5472878890de580ace Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> Gerrit-Reviewer: Ottomata <o...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits