[ 
https://issues.apache.org/jira/browse/KYLIN-4344?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17030320#comment-17030320
 ] 

ASF GitHub Bot commented on KYLIN-4344:
---------------------------------------

wangxiaojing123 commented on pull request #1090: KYLIN-4344 Build Global Dict 
by MR/Hive, Extract Fact Table Distinct Columns Step
URL: https://github.com/apache/kylin/pull/1090
 
 
   
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Build Global Dict by MR/Hive, Extract Fact Table Distinct Columns Step
> ----------------------------------------------------------------------
>
>                 Key: KYLIN-4344
>                 URL: https://issues.apache.org/jira/browse/KYLIN-4344
>             Project: Kylin
>          Issue Type: Sub-task
>            Reporter: wangxiaojing
>            Assignee: wangxiaojing
>            Priority: Major
>
> This step is the first step for Build Global Dict by MR/Hive, mainly realizes 
> the following functions by HQL:
>  * Create global dict hive table of the cube if not exists, default 
> globalDictDatabase.cubeName_global_dict;
>  * Create extract distinct value intermediate hive table, default 
> flatTableName__group_by;
>  * Create global dict hive intermediate table for this segment , defalut 
> flatTableName_global_dict;
>  * Insert data into flatTableName__group_by hive table.
> SQL Example
>  
> {code:java}
> //One cube only have only one global_dict table
> CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict(  
> dict_key STRING COMMENT '',  dict_val INT COMMENT '' ) COMMENT '' PARTITIONED 
> BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED 
> AS TEXTFILE; 
> //Each segment build job has a flatTableName__group_by table
> DROP TABLE IF EXISTS
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by;
>  CREATE TABLE IF NOT EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  (  dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column 
> string) STORED AS TEXTFILE ;
> //Each segment build job has a flatTableName_global_dict table
> DROP TABLE IF EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict;
>  CREATE TABLE IF NOT EXISTS 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict
>  (  dict_key STRING COMMENT '' ,  dict_val STRING COMMENT '' ) COMMENT '' 
> PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY 
> '\t' STORED AS TEXTFILE ;
> //Insert data into flatTableName__group_by table
> //global dict columns partitons
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM  
> (SELECT KYLIN_SALES_SELLER_ID  as DICT_KEY FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 
> GROUP BY KYLIN_SALES_SELLER_ID) a   LEFT JOIN  (SELECT  DICT_KEY FROM 
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = 
> 'KYLIN_SALES_SELLER_ID') b  ON a.DICT_KEY = b.DICT_KEY WHERE   b.DICT_KEY IS 
> NULL ;
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM  
> (SELECT KYLIN_SALES_BUYER_ID  as DICT_KEY FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 
> GROUP BY KYLIN_SALES_BUYER_ID) a   LEFT JOIN  (SELECT  DICT_KEY FROM 
> db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = 
> 'KYLIN_SALES_BUYER_ID') b     ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS 
> NULL;
> // KYLIN_MAX_DISTINCT_COUNT special partition
> INSERT OVERWRITE TABLE 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT  CONCAT_WS(',', 
> tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is 
> null, '0', cast(max_dict_val as string))) FROM  (SELECT  dict_column,count(1) 
> total_distinct_val FROM 
> kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by
>  where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc  
> LEFT JOIN  (SELECT  dict_column,if(max(dict_val) is null, 0, max(dict_val)) 
> as max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by 
> dict_column) tm  ON  tc.dict_column = tm.dict_column;
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to