[MediaWiki-commits] [Gerrit] Refactor webrequest dataset names - change (analytics/refinery)

Ottomata (Code Review) Tue, 13 Jan 2015 15:04:33 -0800

Ottomata has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/184796


Change subject: Refactor webrequest dataset names
......................................................................

Refactor webrequest dataset names

The refined datasets are now called 'webrequest_*', and
the raw datasets are now called 'webrequest_*_raw'.

This also changes the default queue for legacy_tsvs jobs from 'adhoc' to 'default'.

Change-Id: I0285dbbf78014011c48cda53fca3a2fc84635442
---
A hive/webrequest/create_webrequest_raw_table.hql
D hive/webrequest/create_webrequest_refined_table.hql
M hive/webrequest/create_webrequest_table.hql
M oozie/pagecounts-all-sites/load/coordinator.properties
M oozie/webrequest/datasets.xml
A oozie/webrequest/datasets_raw.xml
D oozie/webrequest/datasets_refined.xml
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
M oozie/webrequest/legacy_tsvs/coordinator.xml
M oozie/webrequest/legacy_tsvs/workflow.xml
M oozie/webrequest/load/bundle.properties
M oozie/webrequest/load/bundle.xml
M oozie/webrequest/load/check_sequence_statistics_workflow.xml
M oozie/webrequest/load/coordinator.xml
M oozie/webrequest/load/workflow.xml
M oozie/webrequest/refine/bundle.xml
M oozie/webrequest/refine/coordinator.xml
18 files changed, 231 insertions(+), 231 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/96/184796/1

diff --git a/hive/webrequest/create_webrequest_raw_table.hql 
b/hive/webrequest/create_webrequest_raw_table.hql
new file mode 100644
index 0000000..bc5bf8f
--- /dev/null
+++ b/hive/webrequest/create_webrequest_raw_table.hql
@@ -0,0 +1,54 @@
+-- Creates table statement for raw webrequest table.
+--
+-- NOTE:  When choosing partition field types,
+-- one should take into consideration Hive's
+-- insistence on storing partition values
+-- as strings.  See:
+-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png
+-- and
+-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt
+--
+-- Parameters:
+--     <none>
+--
+-- Usage
+--     hive -f create_webrequest_raw_table.hql \
+--         --database wmf_raw
+--
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest` (
+    `hostname`          string  COMMENT 'Source node hostname',
+    `sequence`          bigint  COMMENT 'Per host sequence number',
+    `dt`                string  COMMENT 'Timestame at cache in ISO 8601',
+    `time_firstbyte`    double  COMMENT 'Time to first byte',
+    `ip`                string  COMMENT 'IP of packet at cache',
+    `cache_status`      string  COMMENT 'Cache status',
+    `http_status`       string  COMMENT 'HTTP status of response',
+    `response_size`     bigint  COMMENT 'Response size',
+    `http_method`       string  COMMENT 'HTTP method of request',
+    `uri_host`          string  COMMENT 'Host of request',
+    `uri_path`          string  COMMENT 'Path of request',
+    `uri_query`         string  COMMENT 'Query of request',
+    `content_type`      string  COMMENT 'Content-Type header of response',
+    `referer`           string  COMMENT 'Referer header of request',
+    `x_forwarded_for`   string  COMMENT 'X-Forwarded-For header of request',
+    `user_agent`        string  COMMENT 'User-Agent header of request',
+    `accept_language`   string  COMMENT 'Accept-Language header of request',
+    `x_analytics`       string  COMMENT 'X-Analytics header of response',
+    `range`             string  COMMENT 'Range header of response')
+PARTITIONED BY (
+    `webrequest_source` string  COMMENT 'Source cluster',
+    `year`              int     COMMENT 'Unpadded year of request',
+    `month`             int     COMMENT 'Unpadded month of request',
+    `day`               int     COMMENT 'Unpadded day of request',
+    `hour`              int     COMMENT 'Unpadded hour of request')
+ROW FORMAT SERDE
+    'org.apache.hcatalog.data.JsonSerDe'
+-- We only care about the INPUTFORMAT, not the OUTPUTFORMAT. But
+-- Hive's syntax does not allow to specify one without the
+-- other. Hence, we give both and use a default for the OUTPUTFORMAT.
+STORED AS INPUTFORMAT
+    'org.apache.hadoop.mapred.SequenceFileInputFormat'
+OUTPUTFORMAT
+    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+;
diff --git a/hive/webrequest/create_webrequest_refined_table.hql 
b/hive/webrequest/create_webrequest_refined_table.hql
deleted file mode 100644
index c8c326e..0000000
--- a/hive/webrequest/create_webrequest_refined_table.hql
+++ /dev/null
@@ -1,55 +0,0 @@
--- Creates table statement for refined webrequest table.
---
--- NOTE:  When choosing partition field types,
--- one should take into consideration Hive's
--- insistence on storing partition values
--- as strings.  See:
--- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png
--- and
--- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt
---
--- Parameters:
---     <none>
---
--- Usage
---     hive -f create_webrequest_refinfed_table.hql --database wmf
---
---
-
-CREATE TABLE IF NOT EXISTS `webrequest`(
-    `hostname` string COMMENT 'Cache node hostname that served this request',
-    `sequence` bigint COMMENT 'Sequence number of request on source cache 
instance',
-    `dt` string COMMENT 'YYYY-MM-DDTHH:mm:ssZ timestamp',
-    `time_firstbyte` double COMMENT 'time until the first byte was served',
-    `ip` string,
-    `cache_status` string,
-    `http_status` string,
-    `response_size` bigint COMMENT 'Response size in bytes',
-    `http_method` string COMMENT 'Request HTTP method',
-    `uri_host` string,
-    `uri_path` string,
-    `uri_query` string,
-    `content_type` string COMMENT 'ContentType of response',
-    `referer` string,
-    `x_forwarded_for` string COMMENT 'X-Forwarded-For header',
-    `user_agent` string,
-    `accept_language` string COMMENT 'AcceptLanguage header',
-    `x_analytics` string COMMENT 'X-Analytics header',
-    `range` string COMMENT 'Range field for multipart files',
-    `is_pageview` boolean COMMENT 'Indicates if this record was marked as a 
pageview during refinement'
-)
-PARTITIONED BY (
-    `webrequest_source` string COMMENT 'Source cluster',
-    `year` int COMMENT 'Unpadded year of request',
-    `month` int COMMENT 'Unpadded month of request',
-    `day` int COMMENT 'Unpadded day of request',
-    `hour` int COMMENT 'Unpadded hour of request')
-CLUSTERED BY(hostname, sequence) INTO 64 BUCKETS
-ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'
-STORED AS
-    INPUTFORMAT
-        'parquet.hive.DeprecatedParquetInputFormat'
-    OUTPUTFORMAT
-        'parquet.hive.DeprecatedParquetOutputFormat'
-LOCATION '/wmf/data/wmf/webrequest'
-;
diff --git a/hive/webrequest/create_webrequest_table.hql 
b/hive/webrequest/create_webrequest_table.hql
index 14b6965..c8c326e 100644
--- a/hive/webrequest/create_webrequest_table.hql
+++ b/hive/webrequest/create_webrequest_table.hql
@@ -1,4 +1,4 @@
--- Creates table statement for raw webrequest table.
+-- Creates table statement for refined webrequest table.
 --
 -- NOTE:  When choosing partition field types,
 -- one should take into consideration Hive's
@@ -12,43 +12,44 @@
 --     <none>
 --
 -- Usage
---     hive -f create_webrequest_table.hql \
---         --database wmf_raw
+--     hive -f create_webrequest_table.hql --database wmf
+--
 --
 
-CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest` (
-    `hostname`          string  COMMENT 'Source node hostname',
-    `sequence`          bigint  COMMENT 'Per host sequence number',
-    `dt`                string  COMMENT 'Timestame at cache in ISO 8601',
-    `time_firstbyte`    double  COMMENT 'Time to first byte',
-    `ip`                string  COMMENT 'IP of packet at cache',
-    `cache_status`      string  COMMENT 'Cache status',
-    `http_status`       string  COMMENT 'HTTP status of response',
-    `response_size`     bigint  COMMENT 'Response size',
-    `http_method`       string  COMMENT 'HTTP method of request',
-    `uri_host`          string  COMMENT 'Host of request',
-    `uri_path`          string  COMMENT 'Path of request',
-    `uri_query`         string  COMMENT 'Query of request',
-    `content_type`      string  COMMENT 'Content-Type header of response',
-    `referer`           string  COMMENT 'Referer header of request',
-    `x_forwarded_for`   string  COMMENT 'X-Forwarded-For header of request',
-    `user_agent`        string  COMMENT 'User-Agent header of request',
-    `accept_language`   string  COMMENT 'Accept-Language header of request',
-    `x_analytics`       string  COMMENT 'X-Analytics header of response',
-    `range`             string  COMMENT 'Range header of response')
+CREATE TABLE IF NOT EXISTS `webrequest`(
+    `hostname` string COMMENT 'Cache node hostname that served this request',
+    `sequence` bigint COMMENT 'Sequence number of request on source cache 
instance',
+    `dt` string COMMENT 'YYYY-MM-DDTHH:mm:ssZ timestamp',
+    `time_firstbyte` double COMMENT 'time until the first byte was served',
+    `ip` string,
+    `cache_status` string,
+    `http_status` string,
+    `response_size` bigint COMMENT 'Response size in bytes',
+    `http_method` string COMMENT 'Request HTTP method',
+    `uri_host` string,
+    `uri_path` string,
+    `uri_query` string,
+    `content_type` string COMMENT 'ContentType of response',
+    `referer` string,
+    `x_forwarded_for` string COMMENT 'X-Forwarded-For header',
+    `user_agent` string,
+    `accept_language` string COMMENT 'AcceptLanguage header',
+    `x_analytics` string COMMENT 'X-Analytics header',
+    `range` string COMMENT 'Range field for multipart files',
+    `is_pageview` boolean COMMENT 'Indicates if this record was marked as a 
pageview during refinement'
+)
 PARTITIONED BY (
-    `webrequest_source` string  COMMENT 'Source cluster',
-    `year`              int     COMMENT 'Unpadded year of request',
-    `month`             int     COMMENT 'Unpadded month of request',
-    `day`               int     COMMENT 'Unpadded day of request',
-    `hour`              int     COMMENT 'Unpadded hour of request')
-ROW FORMAT SERDE
-    'org.apache.hcatalog.data.JsonSerDe'
--- We only care about the INPUTFORMAT, not the OUTPUTFORMAT. But
--- Hive's syntax does not allow to specify one without the
--- other. Hence, we give both and use a default for the OUTPUTFORMAT.
-STORED AS INPUTFORMAT
-    'org.apache.hadoop.mapred.SequenceFileInputFormat'
-OUTPUTFORMAT
-    'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+    `webrequest_source` string COMMENT 'Source cluster',
+    `year` int COMMENT 'Unpadded year of request',
+    `month` int COMMENT 'Unpadded month of request',
+    `day` int COMMENT 'Unpadded day of request',
+    `hour` int COMMENT 'Unpadded hour of request')
+CLUSTERED BY(hostname, sequence) INTO 64 BUCKETS
+ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'
+STORED AS
+    INPUTFORMAT
+        'parquet.hive.DeprecatedParquetInputFormat'
+    OUTPUTFORMAT
+        'parquet.hive.DeprecatedParquetOutputFormat'
+LOCATION '/wmf/data/wmf/webrequest'
 ;
diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties 
b/oozie/pagecounts-all-sites/load/coordinator.properties
index 499c2d3..375cb76 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.properties
+++ b/oozie/pagecounts-all-sites/load/coordinator.properties
@@ -39,13 +39,13 @@
 hive_site_xml                       = 
${oozie_directory}/util/hive/hive-site.xml
 
 # Table to read webrequests from (fully qualified)
-webrequest_table                    = wmf_raw.webrequest
+webrequest_table                    = wmf.webrequest
 
 # Table to write hourly pagecounts to (fully qualified)
 pagecounts_all_sites_table          = wmf.pagecounts_all_sites
 
 # HDFS paths to directories where webrequest data is time bucketed.
-webrequest_data_directory           = ${name_node}/wmf/data/raw/webrequest
+webrequest_data_directory           = ${name_node}/wmf/data/wmf/webrequest
 
 # HDFS path to directory where pagecounts-all-sites data is time bucketed.
 pagecounts_all_sites_data_directory = 
${name_node}/wmf/data/wmf/pagecounts-all-sites
diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml
index 7056ac5..6fab32e 100644
--- a/oozie/webrequest/datasets.xml
+++ b/oozie/webrequest/datasets.xml
@@ -1,84 +1,49 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
-Defines reusable datasets for raw webrequest data.
+Defines reusable datasets for refined webrequest data.
 Use this dataset in your coordinator.xml files by setting:
 
     ${start_time}     - the initial instance of your data.
                         Example: 2014-04-01T00:00Z
-    ${webrequest_data_directory} - Path to directory where data is time 
bucketed.
-                        Example: /wmf/data/raw/webrequest
+    ${webrequest_data_directory} - Path to directory where refined data is 
time bucketed.
+                        Example: /wmf/data/wmf/webrequest
 -->
 
 <datasets>
-    <!--
-    The webrequest_*_unchecked datasets should be used for cases where you do
-    not care if the sequence stats have been checked.  This will simply include
-    any imported hourly data directories that exist.
-    -->
-    <dataset name="webrequest_bits_unchecked"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag></done-flag>
-    </dataset>
-    <dataset name="webrequest_mobile_unchecked"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag></done-flag>
-    </dataset>
-    <dataset name="webrequest_text_unchecked"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag></done-flag>
-    </dataset>
-    <dataset name="webrequest_upload_unchecked"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag></done-flag>
-    </dataset>
 
     <!--
-    The webrequest_* datasets should be used if you want to be
-    sure that you are only working with hourly imports for which
-    sequence stats have been checked.  These directories have an
-    empty _SUCCESS flag created in them once they have been checked
-    and it has been determined that the expected number of requests
-    equals the actual number of entires for this hour.
+    The webrequest_* datasets contain the same data as the 'raw'
+    datasets defined in datasets_raw.xml, except that they use a more
+    efficient storage format, and contain extra information.
+
+    This dataset does not yet include upload or bits.
+
+    To unpad MONTH, DAY, and HOUR, we force coercion to a number by
+    adding 0.
+
+    Note that we do not use “${...}” but “${"$"}{...}”, as dataset files are
+    passed to EL twice in cascade, and in the first EL level, ${MONTH}
+    evaluates to the string “${MONTH}”. Hence, we escape the dollar sign in
+    “${...}” to “${"$"}{...}”. At the first EL level, “${"$"}” gets turned
+    into a dollar sign, and “{...}”  is just passed along. Hence, we arrive
+    at “${...}” as input for the second EL level. There, the variables hold
+    their expected values, and we can start unpadding them.
     -->
-    <dataset name="webrequest_bits"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
-    </dataset>
     <dataset name="webrequest_mobile"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
+        
<uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
+        <done-flag></done-flag>
     </dataset>
     <dataset name="webrequest_text"
              frequency="${coord:hours(1)}"
              initial-instance="${start_time}"
              timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
+        
<uri-template>${webrequest_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
+        <done-flag></done-flag>
+
     </dataset>
-    <dataset name="webrequest_upload"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
-        <done-flag>_SUCCESS</done-flag>
-    </dataset>
+
 
 </datasets>
diff --git a/oozie/webrequest/datasets_raw.xml 
b/oozie/webrequest/datasets_raw.xml
new file mode 100644
index 0000000..0d5c6ee
--- /dev/null
+++ b/oozie/webrequest/datasets_raw.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Defines reusable datasets for raw webrequest data.
+Use this dataset in your coordinator.xml files by setting:
+
+    ${start_time}     - the initial instance of your data.
+                        Example: 2014-04-01T00:00Z
+    ${webrequest_raw_data_directory} - Path to directory where data is time 
bucketed.
+                        Example: /wmf/data/raw/webrequest
+-->
+
+<datasets>
+    <!--
+    The webrequest_*_raw_unchecked datasets should be used for cases where you 
do
+    not care if the sequence stats have been checked.  This will simply include
+    any imported hourly data directories that exist.
+    -->
+    <dataset name="webrequest_bits_raw_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_mobile_raw_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_text_raw_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+    <dataset name="webrequest_upload_raw_unchecked"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag></done-flag>
+    </dataset>
+
+    <!--
+    The webrequest_*_raw datasets should be used if you want to be
+    sure that you are only working with hourly imports for which
+    sequence stats have been checked.  These directories have an
+    empty _SUCCESS flag created in them once they have been checked
+    and it has been determined that the expected number of requests
+    equals the actual number of entries for this hour.
+    -->
+    <dataset name="webrequest_bits_raw"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_mobile_raw"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_text_raw"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+    <dataset name="webrequest_upload_raw"
+             frequency="${coord:hours(1)}"
+             initial-instance="${start_time}"
+             timezone="Universal">
+        
<uri-template>${webrequest_raw_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template>
+        <done-flag>_SUCCESS</done-flag>
+    </dataset>
+
+</datasets>
diff --git a/oozie/webrequest/datasets_refined.xml 
b/oozie/webrequest/datasets_refined.xml
deleted file mode 100644
index 2ac80b8..0000000
--- a/oozie/webrequest/datasets_refined.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-Defines reusable datasets for refined webrequest data.
-Use this dataset in your coordinator.xml files by setting:
-
-    ${start_time}     - the initial instance of your data.
-                        Example: 2014-04-01T00:00Z
-    ${webrequest_refined_data_directory} - Path to directory where refined 
data is time bucketed.
-                        Example: /wmf/data/wmf/webrequest
--->
-
-<datasets>
-
-    <!--
-    The webrequest_*_refined datasets contain the same data as the
-    above two 'raw' datasets, except that they use a more efficient
-    storage format, and contain extra information.
-
-    This dataset does not yet include upload or bits.
-
-    To unpad MONTH, DAY, and HOUR, we force coercion to a number by
-    adding 0.
-
-    Note that we do not use “${...}” but “${"$"}{...}", as dataset files are
-    passed to EL twice in cascade, and in the first EL level, ${MONTH}
-    evaluates to the string “${MONTH}”. Hence, we escape the dollar sign in
-    “${....}" to “${"$"}{...}”. At the first EL level, “${"$"}” gets turned
-    into a dollar sign, and “{...}”  is just passed along. Hence, we arrive
-    at “${...}” as input for the second EL level. There, the variables hold
-    their expected values, and we can start unpadding them.
-
-    TODO: I would like to eventually name this data set 'webrequest_mobile',
-          etc.  and rename the datasets.xml datasets to webrequest_mobile_raw, 
etc.
-    -->
-    <dataset name="webrequest_mobile_refined"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_refined_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
-        <done-flag></done-flag>
-    </dataset>
-    <dataset name="webrequest_text_refined"
-             frequency="${coord:hours(1)}"
-             initial-instance="${start_time}"
-             timezone="Universal">
-        
<uri-template>${webrequest_refined_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template>
-        <done-flag></done-flag>
-
-    </dataset>
-
-</datasets>
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties 
b/oozie/webrequest/legacy_tsvs/bundle.properties
index 48efba0..6ef0ea6 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -41,10 +41,10 @@
 hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
 
 # Table to write hourly pagecounts to (fully qualified)
-webrequest_table                  = wmf_raw.webrequest
+webrequest_table                  = wmf.webrequest
 
 # HDFS path to directory where webrequst data is time bucketed.
-webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
+webrequest_data_directory         = ${name_node}/wmf/data/wmf/webrequest
 
 # Temporary directory
 temporary_directory               = ${name_node}/tmp
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml 
b/oozie/webrequest/legacy_tsvs/bundle.xml
index bb1106a..c3270c3 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -5,7 +5,7 @@
     <parameters>
         <property>
             <name>queue_name</name>
-            <value>adhoc</value>
+            <value>default</value>
         </property>
 
         <!-- Required properties. -->
@@ -16,6 +16,7 @@
         <property><name>stop_time</name></property>
         <property><name>webrequest_datasets_file</name></property>
         <property><name>webrequest_data_directory</name></property>
+
         <property><name>hive_site_xml</name></property>
         <property><name>workflow_file</name></property>
         <property><name>webrequest_table</name></property>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator.xml 
b/oozie/webrequest/legacy_tsvs/coordinator.xml
index e24a44f..fa42c25 100644
--- a/oozie/webrequest/legacy_tsvs/coordinator.xml
+++ b/oozie/webrequest/legacy_tsvs/coordinator.xml
@@ -9,7 +9,7 @@
     <parameters>
         <property>
             <name>queue_name</name>
-            <value>adhoc</value>
+            <value>default</value>
         </property>
 
         <property>
diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml 
b/oozie/webrequest/legacy_tsvs/workflow.xml
index 4f4b342..2d463c0 100644
--- a/oozie/webrequest/legacy_tsvs/workflow.xml
+++ b/oozie/webrequest/legacy_tsvs/workflow.xml
@@ -5,7 +5,7 @@
     <parameters>
         <property>
             <name>queue_name</name>
-            <value>adhoc</value>
+            <value>default</value>
         </property>
         <property>
             <name>aspect_compression_ending</name>
diff --git a/oozie/webrequest/load/bundle.properties 
b/oozie/webrequest/load/bundle.properties
index e42954b..bc7205f 100644
--- a/oozie/webrequest/load/bundle.properties
+++ b/oozie/webrequest/load/bundle.properties
@@ -22,7 +22,7 @@
 workflow_file                     = 
${oozie_directory}/webrequest/load/workflow.xml
 
 # HDFS path to webrequest dataset definition
-datasets_file                     = ${oozie_directory}/webrequest/datasets.xml
+datasets_raw_file                 = 
${oozie_directory}/webrequest/datasets_raw.xml
 
 # Initial import time of the webrequest dataset.
 start_time                        = 2014-04-01T00:00Z
@@ -48,8 +48,8 @@
 # Base directory for obviously faulty hosts
 faulty_hosts_directory            = 
${name_node}/wmf/data/raw/webrequests_faulty_hosts
 
-# HDFS path to directory where webrequest data is time bucketed.
-webrequest_data_directory         = ${name_node}/wmf/data/raw/webrequest
+# HDFS path to directory where webrequest raw data is time bucketed.
+webrequest_raw_data_directory     = ${name_node}/wmf/data/raw/webrequest
 
 # Coordintator to start.
 oozie.bundle.application.path     = 
${oozie_directory}/webrequest/load/bundle.xml
diff --git a/oozie/webrequest/load/bundle.xml b/oozie/webrequest/load/bundle.xml
index df602b7..b3d2534 100644
--- a/oozie/webrequest/load/bundle.xml
+++ b/oozie/webrequest/load/bundle.xml
@@ -15,7 +15,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_raw_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>add_partition_workflow_file</name></property>
@@ -25,7 +25,7 @@
         <property><name>mark_directory_done_workflow_file</name></property>
     </parameters>
 
-    <coordinator name='hive_webrequest_load-webrequest-bits'>
+    <coordinator name='hive_webrequest_raw_load-webrequest_bits'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -35,7 +35,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_webrequest_load-webrequest-mobile'>
+    <coordinator name='hive_webrequest_raw_load-webrequest_mobile'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -45,7 +45,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_webrequest_load-webrequest-text'>
+    <coordinator name='hive_webrequest_raw_load-webrequest_text'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -55,7 +55,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='hive_webrequest_load-webrequest-upload'>
+    <coordinator name='hive_webrequest_raw_load-webrequest_upload'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
diff --git a/oozie/webrequest/load/check_sequence_statistics_workflow.xml 
b/oozie/webrequest/load/check_sequence_statistics_workflow.xml
index 3c46bf0..71f03b4 100644
--- a/oozie/webrequest/load/check_sequence_statistics_workflow.xml
+++ b/oozie/webrequest/load/check_sequence_statistics_workflow.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    
name="check_sequence_statistics-webrequest-${webrequest_source},${year},${month},${day},${hour}-wf">
+    
name="check_sequence_statistics-webrequest_raw-${webrequest_source},${year},${month},${day},${hour}-wf">
 
     <parameters>
         <property>
diff --git a/oozie/webrequest/load/coordinator.xml 
b/oozie/webrequest/load/coordinator.xml
index 77045d9..87cd7b5 100644
--- a/oozie/webrequest/load/coordinator.xml
+++ b/oozie/webrequest/load/coordinator.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="hive_webrequest_load-${table}-${webrequest_source}-coord"
+    name="hive_webrequest_raw_load-${table}-${webrequest_source}-coord"
     frequency="${coord:hours(1)}"
     start="${start_time}"
     end="${stop_time}"
@@ -18,7 +18,7 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_raw_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>add_partition_workflow_file</name></property>
@@ -79,7 +79,7 @@
     </datasets>
 
     <input-events>
-        <data-in name="input" 
dataset="webrequest_${webrequest_source}_unchecked">
+        <data-in name="input" 
dataset="webrequest_${webrequest_source}_raw_unchecked">
             <instance>${coord:current(0)}</instance>
         </data-in>
         <!--
@@ -89,7 +89,7 @@
         might get created /before/ writing for the current dataset
         finishes).
          -->
-        <data-in name="ready_indicator" dataset="webrequest_${webrequest_source}_unchecked">
+        <data-in name="ready_indicator" dataset="webrequest_${webrequest_source}_raw_unchecked">
             <instance>${coord:current(2)}</instance>
         </data-in>
     </input-events>
diff --git a/oozie/webrequest/load/workflow.xml b/oozie/webrequest/load/workflow.xml
index bc2d271..345ddf5 100644
--- a/oozie/webrequest/load/workflow.xml
+++ b/oozie/webrequest/load/workflow.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    name="hive_webrequest_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
+    name="hive_webrequest_raw_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf">
 
     <parameters>
         <property>
diff --git a/oozie/webrequest/refine/bundle.xml b/oozie/webrequest/refine/bundle.xml
index 47b74aa..72048a5 100644
--- a/oozie/webrequest/refine/bundle.xml
+++ b/oozie/webrequest/refine/bundle.xml
@@ -15,7 +15,8 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_raw_data_directory</name></property>
+        <property><name>datasets_raw_file</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>artifacts_directory</name></property>
@@ -37,7 +38,7 @@
     </coordinator>
     -->
 
-    <coordinator name='refine-webrequest-mobile'>
+    <coordinator name='refine-webrequest_mobile'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
@@ -47,7 +48,7 @@
         </configuration>
     </coordinator>
 
-    <coordinator name='refine-webrequest-text'>
+    <coordinator name='refine-webrequest_text'>
         <app-path>${coordinator_file}</app-path>
         <configuration>
             <property>
diff --git a/oozie/webrequest/refine/coordinator.xml b/oozie/webrequest/refine/coordinator.xml
index ef44a36..e747846 100644
--- a/oozie/webrequest/refine/coordinator.xml
+++ b/oozie/webrequest/refine/coordinator.xml
@@ -18,9 +18,9 @@
         <property><name>workflow_file</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_raw_data_directory</name></property>
+        <property><name>datasets_raw_file</name></property>
 
-        <property><name>datasets_file</name></property>
         <property><name>hive_site_xml</name></property>
         <property><name>artifacts_directory</name></property>
         <property><name>source_table</name></property>
@@ -61,11 +61,11 @@
         Include the given datasets_file file.  This should
         define the "webrequest" dataset for this coordinator.
         -->
-        <include>${datasets_file}</include>
+        <include>${datasets_raw_file}</include>
     </datasets>
 
     <input-events>
-        <data-in name="input" dataset="webrequest_${webrequest_source}">
+        <data-in name="input" dataset="webrequest_${webrequest_source}_raw">
             <instance>${coord:current(0)}</instance>
         </data-in>
     </input-events>

-- 
To view, visit https://gerrit.wikimedia.org/r/184796
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0285dbbf78014011c48cda53fca3a2fc84635442
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Refactor webrequest dataset names - change (analytics/refinery)

Reply via email to