Ottomata has uploaded a new change for review. https://gerrit.wikimedia.org/r/184796
Change subject: Refactor webrequest dataset names ...................................................................... Refactor webrequest dataset names Refined dataset is now called 'webrequest_.*', and raw dataset is now called 'webrequest_*_raw'. This also fixes name of default queue for legacy_tsv jobs to 'default'. Change-Id: I0285dbbf78014011c48cda53fca3a2fc84635442 --- A hive/webrequest/create_webrequest_raw_table.hql D hive/webrequest/create_webrequest_refined_table.hql M hive/webrequest/create_webrequest_table.hql M oozie/pagecounts-all-sites/load/coordinator.properties M oozie/webrequest/datasets.xml A oozie/webrequest/datasets_raw.xml D oozie/webrequest/datasets_refined.xml M oozie/webrequest/legacy_tsvs/bundle.properties M oozie/webrequest/legacy_tsvs/bundle.xml M oozie/webrequest/legacy_tsvs/coordinator.xml M oozie/webrequest/legacy_tsvs/workflow.xml M oozie/webrequest/load/bundle.properties M oozie/webrequest/load/bundle.xml M oozie/webrequest/load/check_sequence_statistics_workflow.xml M oozie/webrequest/load/coordinator.xml M oozie/webrequest/load/workflow.xml M oozie/webrequest/refine/bundle.xml M oozie/webrequest/refine/coordinator.xml 18 files changed, 231 insertions(+), 231 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/96/184796/1 diff --git a/hive/webrequest/create_webrequest_raw_table.hql b/hive/webrequest/create_webrequest_raw_table.hql new file mode 100644 index 0000000..bc5bf8f --- /dev/null +++ b/hive/webrequest/create_webrequest_raw_table.hql @@ -0,0 +1,54 @@ +-- Creates table statement for raw webrequest table. +-- +-- NOTE: When choosing partition field types, +-- one should take into consideration Hive's +-- insistence on storing partition values +-- as strings. 
See: +-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png +-- and +-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt +-- +-- Parameters: +-- <none> +-- +-- Usage +-- hive -f create_webrequest_raw_table.hql \ +-- --database wmf_raw +-- + +CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest` ( + `hostname` string COMMENT 'Source node hostname', + `sequence` bigint COMMENT 'Per host sequence number', + `dt` string COMMENT 'Timestame at cache in ISO 8601', + `time_firstbyte` double COMMENT 'Time to first byte', + `ip` string COMMENT 'IP of packet at cache', + `cache_status` string COMMENT 'Cache status', + `http_status` string COMMENT 'HTTP status of response', + `response_size` bigint COMMENT 'Response size', + `http_method` string COMMENT 'HTTP method of request', + `uri_host` string COMMENT 'Host of request', + `uri_path` string COMMENT 'Path of request', + `uri_query` string COMMENT 'Query of request', + `content_type` string COMMENT 'Content-Type header of response', + `referer` string COMMENT 'Referer header of request', + `x_forwarded_for` string COMMENT 'X-Forwarded-For header of request', + `user_agent` string COMMENT 'User-Agent header of request', + `accept_language` string COMMENT 'Accept-Language header of request', + `x_analytics` string COMMENT 'X-Analytics header of response', + `range` string COMMENT 'Range header of response') +PARTITIONED BY ( + `webrequest_source` string COMMENT 'Source cluster', + `year` int COMMENT 'Unpadded year of request', + `month` int COMMENT 'Unpadded month of request', + `day` int COMMENT 'Unpadded day of request', + `hour` int COMMENT 'Unpadded hour of request') +ROW FORMAT SERDE + 'org.apache.hcatalog.data.JsonSerDe' +-- We only care about the INPUTFORMAT, not the OUTPUTFORMAT. But +-- Hive's syntax does not allow to specify one without the +-- other. Hence, we give both and use a default for the OUTPUTFORMAT. 
+STORED AS INPUTFORMAT + 'org.apache.hadoop.mapred.SequenceFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' +; diff --git a/hive/webrequest/create_webrequest_refined_table.hql b/hive/webrequest/create_webrequest_refined_table.hql deleted file mode 100644 index c8c326e..0000000 --- a/hive/webrequest/create_webrequest_refined_table.hql +++ /dev/null @@ -1,55 +0,0 @@ --- Creates table statement for refined webrequest table. --- --- NOTE: When choosing partition field types, --- one should take into consideration Hive's --- insistence on storing partition values --- as strings. See: --- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png --- and --- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt --- --- Parameters: --- <none> --- --- Usage --- hive -f create_webrequest_refinfed_table.hql --database wmf --- --- - -CREATE TABLE IF NOT EXISTS `webrequest`( - `hostname` string COMMENT 'Cache node hostname that served this request', - `sequence` bigint COMMENT 'Sequence number of request on source cache instance', - `dt` string COMMENT 'YYYY-MM-DDTHH:mm:ssZ timestamp', - `time_firstbyte` double COMMENT 'time until the first byte was served', - `ip` string, - `cache_status` string, - `http_status` string, - `response_size` bigint COMMENT 'Response size in bytes', - `http_method` string COMMENT 'Request HTTP method', - `uri_host` string, - `uri_path` string, - `uri_query` string, - `content_type` string COMMENT 'ContentType of response', - `referer` string, - `x_forwarded_for` string COMMENT 'X-Forwarded-For header', - `user_agent` string, - `accept_language` string COMMENT 'AcceptLanguage header', - `x_analytics` string COMMENT 'X-Analytics header', - `range` string COMMENT 'Range field for multipart files', - `is_pageview` boolean COMMENT 'Indicates if this record was marked as a pageview during refinement' -) -PARTITIONED BY ( - `webrequest_source` string COMMENT 'Source cluster', - 
`year` int COMMENT 'Unpadded year of request', - `month` int COMMENT 'Unpadded month of request', - `day` int COMMENT 'Unpadded day of request', - `hour` int COMMENT 'Unpadded hour of request') -CLUSTERED BY(hostname, sequence) INTO 64 BUCKETS -ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' -STORED AS - INPUTFORMAT - 'parquet.hive.DeprecatedParquetInputFormat' - OUTPUTFORMAT - 'parquet.hive.DeprecatedParquetOutputFormat' -LOCATION '/wmf/data/wmf/webrequest' -; diff --git a/hive/webrequest/create_webrequest_table.hql b/hive/webrequest/create_webrequest_table.hql index 14b6965..c8c326e 100644 --- a/hive/webrequest/create_webrequest_table.hql +++ b/hive/webrequest/create_webrequest_table.hql @@ -1,4 +1,4 @@ --- Creates table statement for raw webrequest table. +-- Creates table statement for refined webrequest table. -- -- NOTE: When choosing partition field types, -- one should take into consideration Hive's @@ -12,43 +12,44 @@ -- <none> -- -- Usage --- hive -f create_webrequest_table.hql \ --- --database wmf_raw +-- hive -f create_webrequest_table.hql --database wmf +-- -- -CREATE EXTERNAL TABLE IF NOT EXISTS `webrequest` ( - `hostname` string COMMENT 'Source node hostname', - `sequence` bigint COMMENT 'Per host sequence number', - `dt` string COMMENT 'Timestame at cache in ISO 8601', - `time_firstbyte` double COMMENT 'Time to first byte', - `ip` string COMMENT 'IP of packet at cache', - `cache_status` string COMMENT 'Cache status', - `http_status` string COMMENT 'HTTP status of response', - `response_size` bigint COMMENT 'Response size', - `http_method` string COMMENT 'HTTP method of request', - `uri_host` string COMMENT 'Host of request', - `uri_path` string COMMENT 'Path of request', - `uri_query` string COMMENT 'Query of request', - `content_type` string COMMENT 'Content-Type header of response', - `referer` string COMMENT 'Referer header of request', - `x_forwarded_for` string COMMENT 'X-Forwarded-For header of request', - `user_agent` string 
COMMENT 'User-Agent header of request', - `accept_language` string COMMENT 'Accept-Language header of request', - `x_analytics` string COMMENT 'X-Analytics header of response', - `range` string COMMENT 'Range header of response') +CREATE TABLE IF NOT EXISTS `webrequest`( + `hostname` string COMMENT 'Cache node hostname that served this request', + `sequence` bigint COMMENT 'Sequence number of request on source cache instance', + `dt` string COMMENT 'YYYY-MM-DDTHH:mm:ssZ timestamp', + `time_firstbyte` double COMMENT 'time until the first byte was served', + `ip` string, + `cache_status` string, + `http_status` string, + `response_size` bigint COMMENT 'Response size in bytes', + `http_method` string COMMENT 'Request HTTP method', + `uri_host` string, + `uri_path` string, + `uri_query` string, + `content_type` string COMMENT 'ContentType of response', + `referer` string, + `x_forwarded_for` string COMMENT 'X-Forwarded-For header', + `user_agent` string, + `accept_language` string COMMENT 'AcceptLanguage header', + `x_analytics` string COMMENT 'X-Analytics header', + `range` string COMMENT 'Range field for multipart files', + `is_pageview` boolean COMMENT 'Indicates if this record was marked as a pageview during refinement' +) PARTITIONED BY ( - `webrequest_source` string COMMENT 'Source cluster', - `year` int COMMENT 'Unpadded year of request', - `month` int COMMENT 'Unpadded month of request', - `day` int COMMENT 'Unpadded day of request', - `hour` int COMMENT 'Unpadded hour of request') -ROW FORMAT SERDE - 'org.apache.hcatalog.data.JsonSerDe' --- We only care about the INPUTFORMAT, not the OUTPUTFORMAT. But --- Hive's syntax does not allow to specify one without the --- other. Hence, we give both and use a default for the OUTPUTFORMAT. 
-STORED AS INPUTFORMAT - 'org.apache.hadoop.mapred.SequenceFileInputFormat' -OUTPUTFORMAT - 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' + `webrequest_source` string COMMENT 'Source cluster', + `year` int COMMENT 'Unpadded year of request', + `month` int COMMENT 'Unpadded month of request', + `day` int COMMENT 'Unpadded day of request', + `hour` int COMMENT 'Unpadded hour of request') +CLUSTERED BY(hostname, sequence) INTO 64 BUCKETS +ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' +STORED AS + INPUTFORMAT + 'parquet.hive.DeprecatedParquetInputFormat' + OUTPUTFORMAT + 'parquet.hive.DeprecatedParquetOutputFormat' +LOCATION '/wmf/data/wmf/webrequest' ; diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties b/oozie/pagecounts-all-sites/load/coordinator.properties index 499c2d3..375cb76 100644 --- a/oozie/pagecounts-all-sites/load/coordinator.properties +++ b/oozie/pagecounts-all-sites/load/coordinator.properties @@ -39,13 +39,13 @@ hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml # Table to read webrequests from (fully qualified) -webrequest_table = wmf_raw.webrequest +webrequest_table = wmf.webrequest # Table to write hourly pagecounts to (fully qualified) pagecounts_all_sites_table = wmf.pagecounts_all_sites # HDFS paths to directories where webrequest data is time bucketed. -webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest +webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest # HDFS path to directory where pagecounts-all-sites data is time bucketed. pagecounts_all_sites_data_directory = ${name_node}/wmf/data/wmf/pagecounts-all-sites diff --git a/oozie/webrequest/datasets.xml b/oozie/webrequest/datasets.xml index 7056ac5..6fab32e 100644 --- a/oozie/webrequest/datasets.xml +++ b/oozie/webrequest/datasets.xml @@ -1,84 +1,49 @@ <?xml version="1.0" encoding="UTF-8"?> <!-- -Defines reusable datasets for raw webrequest data. +Defines reusable datasets for refined webrequest data. 
Use this dataset in your coordinator.xml files by setting: ${start_time} - the initial instance of your data. Example: 2014-04-01T00:00Z - ${webrequest_data_directory} - Path to directory where data is time bucketed. - Example: /wmf/data/raw/webrequest + ${webrequest_data_directory} - Path to directory where refined data is time bucketed. + Example: /wmf/data/wmf/webrequest --> <datasets> - <!-- - The webrequest_*_unchecked datasets should be used for cases where you do - not care if the sequence stats have been checked. This will simply include - any imported hourly data directories that exist. - --> - <dataset name="webrequest_bits_unchecked" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag></done-flag> - </dataset> - <dataset name="webrequest_mobile_unchecked" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag></done-flag> - </dataset> - <dataset name="webrequest_text_unchecked" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag></done-flag> - </dataset> - <dataset name="webrequest_upload_unchecked" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag></done-flag> - </dataset> <!-- - The webrequest_* datasets should be used if you want to be - sure that you are only working with hourly imports for which - sequence stats have been checked. 
These directories have an - empty _SUCCESS flag created in them once they have been checked - and it has been determined that the expected number of requests - equals the actual number of entires for this hour. + The webrequest_* datasets contain the same data as the + 'raw' datasets in datasets_raw.xml, except that they use a more efficient + storage format, and contain extra information. + + This dataset does not yet include upload or bits. + + To unpad MONTH, DAY, and HOUR, we force coercion to a number by + adding 0. + + Note that we do not use “${...}” but “${"$"}{...}", as dataset files are + passed to EL twice in cascade, and in the first EL level, ${MONTH} + evaluates to the string “${MONTH}”. Hence, we escape the dollar sign in + “${....}" to “${"$"}{...}”. At the first EL level, “${"$"}” gets turned + into a dollar sign, and “{...}” is just passed along. Hence, we arrive + at “${...}” as input for the second EL level. There, the variables hold + their expected values, and we can start unpadding them. 
--> - <dataset name="webrequest_bits" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag>_SUCCESS</done-flag> - </dataset> <dataset name="webrequest_mobile" frequency="${coord:hours(1)}" initial-instance="${start_time}" timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag>_SUCCESS</done-flag> + <uri-template>${webrequest_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> + <done-flag></done-flag> </dataset> <dataset name="webrequest_text" frequency="${coord:hours(1)}" initial-instance="${start_time}" timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag>_SUCCESS</done-flag> + <uri-template>${webrequest_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> + <done-flag></done-flag> + </dataset> - <dataset name="webrequest_upload" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> - <done-flag>_SUCCESS</done-flag> - </dataset> + </datasets> diff --git a/oozie/webrequest/datasets_raw.xml b/oozie/webrequest/datasets_raw.xml new file mode 100644 index 0000000..0d5c6ee --- /dev/null +++ b/oozie/webrequest/datasets_raw.xml @@ -0,0 +1,84 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- +Defines reusable datasets for raw webrequest data. +Use this dataset in your coordinator.xml files by setting: + + ${start_time} - the initial instance of your data. 
+ Example: 2014-04-01T00:00Z + ${webrequest_raw_data_directory} - Path to directory where data is time bucketed. + Example: /wmf/data/raw/webrequest +--> + +<datasets> + <!-- + The webrequest_*_raw_unchecked datasets should be used for cases where you do + not care if the sequence stats have been checked. This will simply include + any imported hourly data directories that exist. + --> + <dataset name="webrequest_bits_raw_unchecked" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag></done-flag> + </dataset> + <dataset name="webrequest_mobile_raw_unchecked" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag></done-flag> + </dataset> + <dataset name="webrequest_text_raw_unchecked" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag></done-flag> + </dataset> + <dataset name="webrequest_upload_raw_unchecked" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag></done-flag> + </dataset> + + <!-- + The webrequest_*_raw datasets should be used if you want to be + sure that you are only working with hourly imports for which + sequence stats have been checked. These directories have an + empty _SUCCESS flag created in them once they have been checked + and it has been determined that the expected number of requests + equals the actual number of entires for this hour. 
+ --> + <dataset name="webrequest_bits_raw" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_bits/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag>_SUCCESS</done-flag> + </dataset> + <dataset name="webrequest_mobile_raw" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_mobile/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag>_SUCCESS</done-flag> + </dataset> + <dataset name="webrequest_text_raw" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_text/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag>_SUCCESS</done-flag> + </dataset> + <dataset name="webrequest_upload_raw" + frequency="${coord:hours(1)}" + initial-instance="${start_time}" + timezone="Universal"> + <uri-template>${webrequest_raw_data_directory}/webrequest_upload/hourly/${YEAR}/${MONTH}/${DAY}/${HOUR}</uri-template> + <done-flag>_SUCCESS</done-flag> + </dataset> + +</datasets> diff --git a/oozie/webrequest/datasets_refined.xml b/oozie/webrequest/datasets_refined.xml deleted file mode 100644 index 2ac80b8..0000000 --- a/oozie/webrequest/datasets_refined.xml +++ /dev/null @@ -1,51 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- -Defines reusable datasets for refined webrequest data. -Use this dataset in your coordinator.xml files by setting: - - ${start_time} - the initial instance of your data. - Example: 2014-04-01T00:00Z - ${webrequest_refined_data_directory} - Path to directory where refined data is time bucketed. 
- Example: /wmf/data/wmf/webrequest ---> - -<datasets> - - <!-- - The webrequest_*_refined datasets contain the same data as the - above two 'raw' datasets, except that they use a more efficient - storage format, and contain extra information. - - This dataset does not yet include upload or bits. - - To unpad MONTH, DAY, and HOUR, we force coercion to a number by - adding 0. - - Note that we do not use “${...}” but “${"$"}{...}", as dataset files are - passed to EL twice in cascade, and in the first EL level, ${MONTH} - evaluates to the string “${MONTH}”. Hence, we escape the dollar sign in - “${....}" to “${"$"}{...}”. At the first EL level, “${"$"}” gets turned - into a dollar sign, and “{...}” is just passed along. Hence, we arrive - at “${...}” as input for the second EL level. There, the variables hold - their expected values, and we can start unpadding them. - - TODO: I would like to eventually name this data set 'webrequest_mobile', - etc. and rename the datasets.xml datasets to webrequest_mobile_raw, etc. 
- --> - <dataset name="webrequest_mobile_refined" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_refined_data_directory}/webrequest_source=mobile/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> - <done-flag></done-flag> - </dataset> - <dataset name="webrequest_text_refined" - frequency="${coord:hours(1)}" - initial-instance="${start_time}" - timezone="Universal"> - <uri-template>${webrequest_refined_data_directory}/webrequest_source=text/year=${YEAR}/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}</uri-template> - <done-flag></done-flag> - - </dataset> - -</datasets> diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties index 48efba0..6ef0ea6 100644 --- a/oozie/webrequest/legacy_tsvs/bundle.properties +++ b/oozie/webrequest/legacy_tsvs/bundle.properties @@ -41,10 +41,10 @@ hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml # Table to write hourly pagecounts to (fully qualified) -webrequest_table = wmf_raw.webrequest +webrequest_table = wmf.webrequest # HDFS path to directory where webrequst data is time bucketed. -webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest +webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest # Temporary directory temporary_directory = ${name_node}/tmp diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml b/oozie/webrequest/legacy_tsvs/bundle.xml index bb1106a..c3270c3 100644 --- a/oozie/webrequest/legacy_tsvs/bundle.xml +++ b/oozie/webrequest/legacy_tsvs/bundle.xml @@ -5,7 +5,7 @@ <parameters> <property> <name>queue_name</name> - <value>adhoc</value> + <value>default</value> </property> <!-- Required properties. 
--> @@ -16,6 +16,7 @@ <property><name>stop_time</name></property> <property><name>webrequest_datasets_file</name></property> <property><name>webrequest_data_directory</name></property> + <property><name>hive_site_xml</name></property> <property><name>workflow_file</name></property> <property><name>webrequest_table</name></property> diff --git a/oozie/webrequest/legacy_tsvs/coordinator.xml b/oozie/webrequest/legacy_tsvs/coordinator.xml index e24a44f..fa42c25 100644 --- a/oozie/webrequest/legacy_tsvs/coordinator.xml +++ b/oozie/webrequest/legacy_tsvs/coordinator.xml @@ -9,7 +9,7 @@ <parameters> <property> <name>queue_name</name> - <value>adhoc</value> + <value>default</value> </property> <property> diff --git a/oozie/webrequest/legacy_tsvs/workflow.xml b/oozie/webrequest/legacy_tsvs/workflow.xml index 4f4b342..2d463c0 100644 --- a/oozie/webrequest/legacy_tsvs/workflow.xml +++ b/oozie/webrequest/legacy_tsvs/workflow.xml @@ -5,7 +5,7 @@ <parameters> <property> <name>queue_name</name> - <value>adhoc</value> + <value>default</value> </property> <property> <name>aspect_compression_ending</name> diff --git a/oozie/webrequest/load/bundle.properties b/oozie/webrequest/load/bundle.properties index e42954b..bc7205f 100644 --- a/oozie/webrequest/load/bundle.properties +++ b/oozie/webrequest/load/bundle.properties @@ -22,7 +22,7 @@ workflow_file = ${oozie_directory}/webrequest/load/workflow.xml # HDFS path to webrequest dataset definition -datasets_file = ${oozie_directory}/webrequest/datasets.xml +datasets_raw_file = ${oozie_directory}/webrequest/datasets_raw.xml # Initial import time of the webrequest dataset. start_time = 2014-04-01T00:00Z @@ -48,8 +48,8 @@ # Base directory for obviously faulty hosts faulty_hosts_directory = ${name_node}/wmf/data/raw/webrequests_faulty_hosts -# HDFS path to directory where webrequest data is time bucketed. -webrequest_data_directory = ${name_node}/wmf/data/raw/webrequest +# HDFS path to directory where webrequest raw data is time bucketed. 
+webrequest_raw_data_directory = ${name_node}/wmf/data/raw/webrequest # Coordintator to start. oozie.bundle.application.path = ${oozie_directory}/webrequest/load/bundle.xml diff --git a/oozie/webrequest/load/bundle.xml b/oozie/webrequest/load/bundle.xml index df602b7..b3d2534 100644 --- a/oozie/webrequest/load/bundle.xml +++ b/oozie/webrequest/load/bundle.xml @@ -15,7 +15,7 @@ <property><name>workflow_file</name></property> <property><name>start_time</name></property> <property><name>stop_time</name></property> - <property><name>webrequest_data_directory</name></property> + <property><name>webrequest_raw_data_directory</name></property> <property><name>hive_site_xml</name></property> <property><name>add_partition_workflow_file</name></property> @@ -25,7 +25,7 @@ <property><name>mark_directory_done_workflow_file</name></property> </parameters> - <coordinator name='hive_webrequest_load-webrequest-bits'> + <coordinator name='hive_webrequest_raw_load-webrequest_bits'> <app-path>${coordinator_file}</app-path> <configuration> <property> @@ -35,7 +35,7 @@ </configuration> </coordinator> - <coordinator name='hive_webrequest_load-webrequest-mobile'> + <coordinator name='hive_webrequest_raw_load-webrequest_mobile'> <app-path>${coordinator_file}</app-path> <configuration> <property> @@ -45,7 +45,7 @@ </configuration> </coordinator> - <coordinator name='hive_webrequest_load-webrequest-text'> + <coordinator name='hive_webrequest_raw_load-webrequest_text'> <app-path>${coordinator_file}</app-path> <configuration> <property> @@ -55,7 +55,7 @@ </configuration> </coordinator> - <coordinator name='hive_webrequest_load-webrequest-upload'> + <coordinator name='hive_webrequest_raw_load-webrequest_upload'> <app-path>${coordinator_file}</app-path> <configuration> <property> diff --git a/oozie/webrequest/load/check_sequence_statistics_workflow.xml b/oozie/webrequest/load/check_sequence_statistics_workflow.xml index 3c46bf0..71f03b4 100644 --- 
a/oozie/webrequest/load/check_sequence_statistics_workflow.xml +++ b/oozie/webrequest/load/check_sequence_statistics_workflow.xml @@ -1,6 +1,6 @@ <?xml version="1.0" encoding="UTF-8"?> <workflow-app xmlns="uri:oozie:workflow:0.4" - name="check_sequence_statistics-webrequest-${webrequest_source},${year},${month},${day},${hour}-wf"> + name="check_sequence_statistics-webrequest_raw-${webrequest_source},${year},${month},${day},${hour}-wf"> <parameters> <property> diff --git a/oozie/webrequest/load/coordinator.xml b/oozie/webrequest/load/coordinator.xml index 77045d9..87cd7b5 100644 --- a/oozie/webrequest/load/coordinator.xml +++ b/oozie/webrequest/load/coordinator.xml @@ -1,6 +1,6 @@ <?xml version="1.0" encoding="UTF-8"?> <coordinator-app xmlns="uri:oozie:coordinator:0.4" - name="hive_webrequest_load-${table}-${webrequest_source}-coord" + name="hive_webrequest_raw_load-${table}-${webrequest_source}-coord" frequency="${coord:hours(1)}" start="${start_time}" end="${stop_time}" @@ -18,7 +18,7 @@ <property><name>workflow_file</name></property> <property><name>start_time</name></property> <property><name>stop_time</name></property> - <property><name>webrequest_data_directory</name></property> + <property><name>webrequest_raw_data_directory</name></property> <property><name>hive_site_xml</name></property> <property><name>add_partition_workflow_file</name></property> @@ -79,7 +79,7 @@ </datasets> <input-events> - <data-in name="input" dataset="webrequest_${webrequest_source}_unchecked"> + <data-in name="input" dataset="webrequest_${webrequest_source}_raw_unchecked"> <instance>${coord:current(0)}</instance> </data-in> <!-- @@ -89,7 +89,7 @@ might get created /before/ writing for the current dataset finishes). 
--> - <data-in name="ready_indicator" dataset="webrequest_${webrequest_source}_unchecked"> + <data-in name="ready_indicator" dataset="webrequest_${webrequest_source}_raw_unchecked"> <instance>${coord:current(2)}</instance> </data-in> </input-events> diff --git a/oozie/webrequest/load/workflow.xml b/oozie/webrequest/load/workflow.xml index bc2d271..345ddf5 100644 --- a/oozie/webrequest/load/workflow.xml +++ b/oozie/webrequest/load/workflow.xml @@ -1,6 +1,6 @@ <?xml version="1.0" encoding="UTF-8"?> <workflow-app xmlns="uri:oozie:workflow:0.4" - name="hive_webrequest_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf"> + name="hive_webrequest_raw_load-${table}-${webrequest_source},${year},${month},${day},${hour}-wf"> <parameters> <property> diff --git a/oozie/webrequest/refine/bundle.xml b/oozie/webrequest/refine/bundle.xml index 47b74aa..72048a5 100644 --- a/oozie/webrequest/refine/bundle.xml +++ b/oozie/webrequest/refine/bundle.xml @@ -15,7 +15,8 @@ <property><name>workflow_file</name></property> <property><name>start_time</name></property> <property><name>stop_time</name></property> - <property><name>webrequest_data_directory</name></property> + <property><name>webrequest_raw_data_directory</name></property> + <property><name>datasets_raw_file</name></property> <property><name>hive_site_xml</name></property> <property><name>artifacts_directory</name></property> @@ -37,7 +38,7 @@ </coordinator> --> - <coordinator name='refine-webrequest-mobile'> + <coordinator name='refine-webrequest_mobile'> <app-path>${coordinator_file}</app-path> <configuration> <property> @@ -47,7 +48,7 @@ </configuration> </coordinator> - <coordinator name='refine-webrequest-text'> + <coordinator name='refine-webrequest_text'> <app-path>${coordinator_file}</app-path> <configuration> <property> diff --git a/oozie/webrequest/refine/coordinator.xml b/oozie/webrequest/refine/coordinator.xml index ef44a36..e747846 100644 --- a/oozie/webrequest/refine/coordinator.xml +++ 
b/oozie/webrequest/refine/coordinator.xml @@ -18,9 +18,9 @@ <property><name>workflow_file</name></property> <property><name>start_time</name></property> <property><name>stop_time</name></property> - <property><name>webrequest_data_directory</name></property> + <property><name>webrequest_raw_data_directory</name></property> + <property><name>datasets_raw_file</name></property> - <property><name>datasets_file</name></property> <property><name>hive_site_xml</name></property> <property><name>artifacts_directory</name></property> <property><name>source_table</name></property> @@ -61,11 +61,11 @@ Include the given datasets_file file. This should define the "webrequest" dataset for this coordinator. --> - <include>${datasets_file}</include> + <include>${datasets_raw_file}</include> </datasets> <input-events> - <data-in name="input" dataset="webrequest_${webrequest_source}"> + <data-in name="input" dataset="webrequest_${webrequest_source}_raw"> <instance>${coord:current(0)}</instance> </data-in> </input-events> -- To view, visit https://gerrit.wikimedia.org/r/184796 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0285dbbf78014011c48cda53fca3a2fc84635442 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Ottomata <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
