Ottomata has submitted this change and it was merged.
Change subject: Use bits when producing legacy tsvs
......................................................................
Use bits when producing legacy tsvs
Change-Id: I5a64ec87c11466e480058ca1d9b29022fbbeeb9f
---
M bin/refinery-dump-status-webrequest-partitions
M diagrams/oozie-overview.dia
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
C oozie/webrequest/legacy_tsvs/coordinator_bits.xml
R oozie/webrequest/legacy_tsvs/coordinator_bits_misc_mobile_text.xml
A oozie/webrequest/legacy_tsvs/generate_5xx-bits_tsv.hql
7 files changed, 66 insertions(+), 48 deletions(-)
Approvals:
Ottomata: Verified; Looks good to me, approved
diff --git a/bin/refinery-dump-status-webrequest-partitions
b/bin/refinery-dump-status-webrequest-partitions
index f8fc75e..93f4bd7 100755
--- a/bin/refinery-dump-status-webrequest-partitions
+++ b/bin/refinery-dump-status-webrequest-partitions
@@ -69,7 +69,7 @@
DATASET_VISIBILITIES["$DATASET"]=no
}
-add_dataset "legacy_tsvs" "daily" " 5xx | 5xx-misc |5xx-mobile |
5xx-text |5xx-upload | api | edits | glam_nara | mobile |
sampled | zero |"
+add_dataset "legacy_tsvs" "daily" " 5xx | 5xx-bits | 5xx-misc
|5xx-mobile | 5xx-text |5xx-upload | api | edits | glam_nara |
mobile | sampled | zero |"
add_dataset "pagecounts_all_sites" "hourly" " file name date | page |
project |"
add_dataset "pagecounts_raw" "hourly" " file name date | page | project |"
add_dataset "raw_webrequest" "hourly" " bits | misc | mobile | text |
upload |"
@@ -336,6 +336,7 @@
local BASE
for BASE in \
5xx/5xx \
+ 5xx-bits/5xx-bits \
5xx-misc/5xx-misc \
5xx-mobile/5xx-mobile \
5xx-text/5xx-text \
diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index e70d6f4..95926f6 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties
b/oozie/webrequest/legacy_tsvs/bundle.properties
index ed9128d..f58e252 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -8,75 +8,76 @@
# .xml files exist there when this job is submitted.
-name_node = hdfs://analytics-hadoop
-job_tracker =
resourcemanager.analytics.eqiad.wmnet:8032
-queue_name = default
+name_node = hdfs://analytics-hadoop
+job_tracker =
resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
# Base path in HDFS to refinery.
# When submitting this job for production, you should
# override this to point directly at a deployed
# directory name, and not the 'symbolic' 'current' directory.
# E.g. /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
-refinery_directory = ${name_node}/wmf/refinery/current
+refinery_directory = ${name_node}/wmf/refinery/current
# HDFS path to artifacts that will be used by this job.
# E.g. refinery-hive.jar should exist here.
-artifacts_directory = ${refinery_directory}/artifacts
+artifacts_directory = ${refinery_directory}/artifacts
# Base path in HDFS to oozie files.
# Other files will be used relative to this path.
-oozie_directory = ${refinery_directory}/oozie
+oozie_directory = ${refinery_directory}/oozie
# HDFS paths to the coordinators to run.
# All of them are essentially the same coordinator and differ only in the
# webrequest_sources they depend on. This makes it possible, for example, to turn
# off upload and have the coordinators that depend on upload block, while the
# coordinators that do not depend on upload continue to run.
-coordinator_misc_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml
-coordinator_misc_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
-coordinator_mobile_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml
-coordinator_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
-coordinator_mobile_text_upload_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
-coordinator_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml
-coordinator_upload_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml
+coordinator_bits_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_bits.xml
+coordinator_bits_misc_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_bits_misc_mobile_text.xml
+coordinator_misc_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc.xml
+coordinator_mobile_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml
+coordinator_mobile_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
+coordinator_mobile_text_upload_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
+coordinator_text_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_text.xml
+coordinator_upload_file =
${oozie_directory}/webrequest/legacy_tsvs/coordinator_upload.xml
# HDFS path to workflow to run.
-workflow_file =
${oozie_directory}/webrequest/legacy_tsvs/workflow.xml
+workflow_file =
${oozie_directory}/webrequest/legacy_tsvs/workflow.xml
# HDFS path to webrequest dataset definition
-webrequest_datasets_file =
${oozie_directory}/webrequest/datasets.xml
+webrequest_datasets_file =
${oozie_directory}/webrequest/datasets.xml
# Time to start running this coordinator.
# Make sure to have hours and minutes at 0!
-start_time = 2014-04-01T00:00Z
+start_time = 2014-04-01T00:00Z
# Time to stop running this coordinator. Year 3000 == never!
-stop_time = 3000-01-01T00:00Z
+stop_time = 3000-01-01T00:00Z
# HDFS path to workflow to mark a directory as done
-mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
+mark_directory_done_workflow_file =
${oozie_directory}/util/mark_directory_done/workflow.xml
-archive_job_output_workflow_file =
${oozie_directory}/util/archive_job_output/workflow.xml
+archive_job_output_workflow_file =
${oozie_directory}/util/archive_job_output/workflow.xml
# HDFS path to hive-site.xml file. This is needed to run hive actions.
-hive_site_xml =
${oozie_directory}/util/hive/hive-site.xml
+hive_site_xml =
${oozie_directory}/util/hive/hive-site.xml
# Table to write hourly pagecounts to (fully qualified)
-webrequest_table = wmf.webrequest
+webrequest_table = wmf.webrequest
# HDFS path to directory where webrequest data is time bucketed.
-webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
+webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
# Temporary directory
-temporary_directory = ${name_node}/tmp
+temporary_directory = ${name_node}/tmp
# Archive base directory
-archive_directory = ${name_node}/wmf/data/archive
+archive_directory = ${name_node}/wmf/data/archive
# Archive directory for webrequest data
-webrequest_archive_directory = ${archive_directory}/webrequest
+webrequest_archive_directory = ${archive_directory}/webrequest
# Coordinator to start.
-oozie.bundle.application.path =
${oozie_directory}/webrequest/legacy_tsvs/bundle.xml
-oozie.use.system.libpath = true
-oozie.action.external.stats.write = true
+oozie.bundle.application.path =
${oozie_directory}/webrequest/legacy_tsvs/bundle.xml
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml
b/oozie/webrequest/legacy_tsvs/bundle.xml
index 7d65232..f4b442e 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -9,8 +9,9 @@
</property>
<!-- Required properties. -->
+ <property><name>coordinator_bits_file</name></property>
+
<property><name>coordinator_bits_misc_mobile_text_file</name></property>
<property><name>coordinator_misc_file</name></property>
- <property><name>coordinator_misc_mobile_text_file</name></property>
<property><name>coordinator_mobile_file</name></property>
<property><name>coordinator_mobile_text_file</name></property>
<property><name>coordinator_mobile_text_upload_file</name></property>
@@ -145,8 +146,11 @@
</coordinator>
<coordinator name="webrequest_legacy_tsvs-5xx">
- <!-- TODO: Add 'bits', once it's turned on again -->
- <app-path>${coordinator_misc_mobile_text_file}</app-path>
+ <!--
+ No 'upload', as that explicitly got excluded in the udp2log
+ filters.
+ -->
+ <app-path>${coordinator_bits_misc_mobile_text_file}</app-path>
<configuration>
<property>
<name>aspect_name</name>
@@ -163,7 +167,23 @@
</configuration>
</coordinator>
- <!-- TODO: Add 'bits' per-source 5xx variant, once it's turned on again -->
+ <coordinator name="webrequest_legacy_tsvs-5xx-bits">
+ <app-path>${coordinator_bits_file}</app-path>
+ <configuration>
+ <property>
+ <name>aspect_name</name>
+ <value>5xx-bits</value>
+ </property>
+ <property>
+ <name>aspect_tsv_archive_directory</name>
+ <value>${webrequest_archive_directory}/5xx-bits</value>
+ </property>
+ <property>
+ <name>hour_offset</name>
+ <value>9</value>
+ </property>
+ </configuration>
+ </coordinator>
<coordinator name="webrequest_legacy_tsvs-5xx-misc">
<app-path>${coordinator_misc_file}</app-path>
@@ -178,7 +198,7 @@
</property>
<property>
<name>hour_offset</name>
- <value>9</value>
+ <value>10</value>
</property>
</configuration>
</coordinator>
@@ -196,7 +216,7 @@
</property>
<property>
<name>hour_offset</name>
- <value>10</value>
+ <value>11</value>
</property>
</configuration>
</coordinator>
@@ -214,7 +234,7 @@
</property>
<property>
<name>hour_offset</name>
- <value>11</value>
+ <value>12</value>
</property>
</configuration>
</coordinator>
@@ -232,7 +252,7 @@
</property>
<property>
<name>hour_offset</name>
- <value>12</value>
+ <value>13</value>
</property>
</configuration>
</coordinator>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_bits.xml
similarity index 91%
copy from oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
copy to oozie/webrequest/legacy_tsvs/coordinator_bits.xml
index 9f17521..abaaf2f 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_bits.xml
@@ -60,17 +60,7 @@
</datasets>
<input-events>
- <data-in name="webrequest_misc" dataset="webrequest_misc">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_mobile" dataset="webrequest_mobile">
- <start-instance>${coord:current(0)}</start-instance>
- <end-instance>${coord:current(23)}</end-instance>
- </data-in>
-
- <data-in name="webrequest_text" dataset="webrequest_text">
+ <data-in name="webrequest_bits" dataset="webrequest_bits">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
</data-in>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
b/oozie/webrequest/legacy_tsvs/coordinator_bits_misc_mobile_text.xml
similarity index 96%
rename from oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
rename to oozie/webrequest/legacy_tsvs/coordinator_bits_misc_mobile_text.xml
index 9f17521..a013528 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator_bits_misc_mobile_text.xml
@@ -60,6 +60,11 @@
</datasets>
<input-events>
+ <data-in name="webrequest_bits" dataset="webrequest_bits">
+ <start-instance>${coord:current(0)}</start-instance>
+ <end-instance>${coord:current(23)}</end-instance>
+ </data-in>
+
<data-in name="webrequest_misc" dataset="webrequest_misc">
<start-instance>${coord:current(0)}</start-instance>
<end-instance>${coord:current(23)}</end-instance>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx-bits_tsv.hql
b/oozie/webrequest/legacy_tsvs/generate_5xx-bits_tsv.hql
new file mode 120000
index 0000000..a8f4179
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx-bits_tsv.hql
@@ -0,0 +1 @@
+generate_5xx_tsv.hql
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/186970
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I5a64ec87c11466e480058ca1d9b29022fbbeeb9f
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits