[
https://issues.apache.org/jira/browse/KYLIN-4344?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
wangxiaojing updated KYLIN-4344:
--------------------------------
Description:
This step is the first step for Build Global Dict by MR/Hive, mainly realizes
the following functions by HQL:
* Create global dict hive table of the cube if not exists, default
globalDictDatabase.cubeName_global_dict;
* Create extract distinct value intermediate hive table, default
flatTableName__group_by;
* Create global dict hive intermediate table for this segment , defalut
flatTableName_global_dict;
* Insert data into flatTableName__group_by hive table.
SQL Example
{code:java}
//One cube only have only one global_dict table
CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict( dict_key
STRING COMMENT '', dict_val INT COMMENT '' ) COMMENT '' PARTITIONED BY
(dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS
TEXTFILE;
//Each segment build job has a flatTableName__group_by table
DROP TABLE IF EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by;
CREATE TABLE IF NOT EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
( dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column string)
STORED AS TEXTFILE ;
//Each segment build job has a flatTableName_global_dict table
DROP TABLE IF EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict;
CREATE TABLE IF NOT EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict
( dict_key STRING COMMENT '' , dict_val STRING COMMENT '' ) COMMENT ''
PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY
'\t' STORED AS TEXTFILE ;
//Insert data into flatTableName__group_by table
//global dict columns partitons
INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM
(SELECT KYLIN_SALES_SELLER_ID as DICT_KEY FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
GROUP BY KYLIN_SALES_SELLER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
'KYLIN_SALES_SELLER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
NULL ;INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM
(SELECT KYLIN_SALES_BUYER_ID as DICT_KEY FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
GROUP BY KYLIN_SALES_BUYER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
'KYLIN_SALES_BUYER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
NULL;
// KYLIN_MAX_DISTINCT_COUNT special partition
INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT CONCAT_WS(',',
tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is
null, '0', cast(max_dict_val as string))) FROM (SELECT dict_column,count(1)
total_distinct_val FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc LEFT
JOIN (SELECT dict_column,if(max(dict_val) is null, 0, max(dict_val)) as
max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by
dict_column) tm ON tc.dict_column = tm.dict_column;
{code}
was:
This step is the first step for Build Global Dict by MR/Hive, mainly realizes
the following functions by HQL:
* Create global dict hive table of the cube if not exists, default
globalDictDatabase.cubeName_global_dict;
* Create extract distinct value intermediate hive table, default
flatTableName__group_by;
* Create global dict hive intermediate table for this segment , defalut
flatTableName_global_dict;
* Insert data into flatTableName__group_by hive table.
SQL Example
{code:java}
//One cube only have only one global_dict table
CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict( dict_key
STRING COMMENT '', dict_val INT COMMENT '' ) COMMENT '' PARTITIONED BY
(dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS
TEXTFILE;
//Each segment build job has a flatTableName__group_by tableDROP TABLE IF
EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by;
CREATE TABLE IF NOT EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
( dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column string)
STORED AS TEXTFILE ;
//Each segment build job has a flatTableName_global_dict tableDROP TABLE IF
EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict;
CREATE TABLE IF NOT EXISTS
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict
( dict_key STRING COMMENT '' , dict_val STRING COMMENT '' ) COMMENT ''
PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY
'\t' STORED AS TEXTFILE ;
//Insert data into flatTableName__group_by table
//global dict columns partitons
INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM
(SELECT KYLIN_SALES_SELLER_ID as DICT_KEY FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
GROUP BY KYLIN_SALES_SELLER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
'KYLIN_SALES_SELLER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
NULL ;INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM
(SELECT KYLIN_SALES_BUYER_ID as DICT_KEY FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
GROUP BY KYLIN_SALES_BUYER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
'KYLIN_SALES_BUYER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
NULL;
// KYLIN_MAX_DISTINCT_COUNT special partition
INSERT OVERWRITE TABLE
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT CONCAT_WS(',',
tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is
null, '0', cast(max_dict_val as string))) FROM (SELECT dict_column,count(1)
total_distinct_val FROM
kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc LEFT
JOIN (SELECT dict_column,if(max(dict_val) is null, 0, max(dict_val)) as
max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by
dict_column) tm ON tc.dict_column = tm.dict_column;
{code}
> Build Global Dict by MR/Hive, Extract Fact Table Distinct Columns Step
> ----------------------------------------------------------------------
>
> Key: KYLIN-4344
> URL: https://issues.apache.org/jira/browse/KYLIN-4344
> Project: Kylin
> Issue Type: Sub-task
> Reporter: wangxiaojing
> Assignee: wangxiaojing
> Priority: Major
>
> This step is the first step for Build Global Dict by MR/Hive, mainly realizes
> the following functions by HQL:
> * Create global dict hive table of the cube if not exists, default
> globalDictDatabase.cubeName_global_dict;
> * Create extract distinct value intermediate hive table, default
> flatTableName__group_by;
> * Create global dict hive intermediate table for this segment , defalut
> flatTableName_global_dict;
> * Insert data into flatTableName__group_by hive table.
> SQL Example
>
> {code:java}
> //One cube only have only one global_dict table
> CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict(
> dict_key STRING COMMENT '', dict_val INT COMMENT '' ) COMMENT '' PARTITIONED
> BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED
> AS TEXTFILE;
> //Each segment build job has a flatTableName__group_by table
> DROP TABLE IF EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by;
> CREATE TABLE IF NOT EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
> ( dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column
> string) STORED AS TEXTFILE ;
> //Each segment build job has a flatTableName_global_dict table
> DROP TABLE IF EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict;
> CREATE TABLE IF NOT EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict
> ( dict_key STRING COMMENT '' , dict_val STRING COMMENT '' ) COMMENT ''
> PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY
> '\t' STORED AS TEXTFILE ;
> //Insert data into flatTableName__group_by table
> //global dict columns partitons
> INSERT OVERWRITE TABLE
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
> PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM
> (SELECT KYLIN_SALES_SELLER_ID as DICT_KEY FROM
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
> GROUP BY KYLIN_SALES_SELLER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
> 'KYLIN_SALES_SELLER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
> NULL ;INSERT OVERWRITE TABLE
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
> PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM
> (SELECT KYLIN_SALES_BUYER_ID as DICT_KEY FROM
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115
> GROUP BY KYLIN_SALES_BUYER_ID) a LEFT JOIN (SELECT DICT_KEY FROM
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN =
> 'KYLIN_SALES_BUYER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS
> NULL;
> // KYLIN_MAX_DISTINCT_COUNT special partition
> INSERT OVERWRITE TABLE
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
> PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT CONCAT_WS(',',
> tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is
> null, '0', cast(max_dict_val as string))) FROM (SELECT dict_column,count(1)
> total_distinct_val FROM
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
> where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc
> LEFT JOIN (SELECT dict_column,if(max(dict_val) is null, 0, max(dict_val))
> as max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by
> dict_column) tm ON tc.dict_column = tm.dict_column;
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)