[ 
https://issues.apache.org/jira/browse/KYLIN-4344?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Xiaoxiang Yu resolved KYLIN-4344.
---------------------------------
    Resolution: Fixed

> Build Global Dict by MR/Hive, Extract Fact Table Distinct Columns Step
> ----------------------------------------------------------------------
>
>                 Key: KYLIN-4344
>                 URL: https://issues.apache.org/jira/browse/KYLIN-4344
>             Project: Kylin
>          Issue Type: Sub-task
>            Reporter: wangxiaojing
>            Assignee: wangxiaojing
>            Priority: Major
>
> This step is the first step for Build Global Dict by MR/Hive, mainly realizes 
> the following functions by HQL:
>  * Create global dict hive table of the cube if not exists, default 
> globalDictDatabase.cubeName_global_dict;
>  * Create extract distinct value intermediate hive table, default 
> flatTableName__group_by;
>  * Create global dict hive intermediate table for this segment , defalut 
> flatTableName_global_dict;
>  * Insert data into flatTableName__group_by hive table.
> SQL Example
>  
> {code:java}
> //One cube only have only one global_dict table
> CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict(  
> dict_key STRING COMMENT '',  dict_val INT COMMENT '' ) COMMENT '' PARTITIONED 
> BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED 
> AS TEXTFILE; 
> //Each segment build job has a flatTableName__group_by table
> DROP TABLE IF EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by;
>  CREATE TABLE IF NOT EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  (  dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column 
> string) STORED AS TEXTFILE ;
> //Each segment build job has a flatTableName_global_dict table
> DROP TABLE IF EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict;
>  CREATE TABLE IF NOT EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict
>  (  dict_key STRING COMMENT '' ,  dict_val STRING COMMENT '' ) COMMENT '' 
> PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY 
> '\t' STORED AS TEXTFILE ;
> //Insert data into flatTableName__group_by table
> //global dict columns partitons
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM  
> (SELECT KYLIN_SALES_SELLER_ID  as DICT_KEY FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 
> GROUP BY KYLIN_SALES_SELLER_ID) a   LEFT JOIN  (SELECT  DICT_KEY FROM 
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = 
> 'KYLIN_SALES_SELLER_ID') b  ON a.DICT_KEY = b.DICT_KEY WHERE   b.DICT_KEY IS 
> NULL ;
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM  
> (SELECT KYLIN_SALES_BUYER_ID  as DICT_KEY FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 
> GROUP BY KYLIN_SALES_BUYER_ID) a   LEFT JOIN  (SELECT  DICT_KEY FROM 
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = 
> 'KYLIN_SALES_BUYER_ID') b     ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS 
> NULL;
> // KYLIN_MAX_DISTINCT_COUNT special partition
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT  CONCAT_WS(',', 
> tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is 
> null, '0', cast(max_dict_val as string))) FROM  (SELECT  dict_column,count(1) 
> total_distinct_val FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc  
> LEFT JOIN  (SELECT  dict_column,if(max(dict_val) is null, 0, max(dict_val)) 
> as max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by 
> dict_column) tm  ON  tc.dict_column = tm.dict_column;
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to