[ https://issues.apache.org/jira/browse/KYLIN-4344?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xiaoxiang Yu resolved KYLIN-4344. --------------------------------- Resolution: Fixed > Build Global Dict by MR/Hive, Extract Fact Table Distinct Columns Step > ---------------------------------------------------------------------- > > Key: KYLIN-4344 > URL: https://issues.apache.org/jira/browse/KYLIN-4344 > Project: Kylin > Issue Type: Sub-task > Reporter: wangxiaojing > Assignee: wangxiaojing > Priority: Major > > This step is the first step for Build Global Dict by MR/Hive, mainly realizes > the following functions by HQL: > * Create global dict hive table of the cube if not exists, default > globalDictDatabase.cubeName_global_dict; > * Create extract distinct value intermediate hive table, default > flatTableName__group_by; > * Create global dict hive intermediate table for this segment , defalut > flatTableName_global_dict; > * Insert data into flatTableName__group_by hive table. > SQL Example > > {code:java} > //One cube only have only one global_dict table > CREATE TABLE IF NOT EXISTS db.kylin_sales_cube_mr_notopn_global_dict( > dict_key STRING COMMENT '', dict_val INT COMMENT '' ) COMMENT '' PARTITIONED > BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED > AS TEXTFILE; > //Each segment build job has a flatTableName__group_by table > DROP TABLE IF EXISTS > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by; > CREATE TABLE IF NOT EXISTS > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by > ( dict_key STRING COMMENT '' ) COMMENT '' PARTITIONED BY (dict_column > string) STORED AS TEXTFILE ; > //Each segment build job has a flatTableName_global_dict table > DROP TABLE IF EXISTS > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict; > CREATE TABLE IF NOT EXISTS > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115_global_dict > ( dict_key STRING COMMENT '' , dict_val STRING COMMENT '' ) COMMENT '' > PARTITIONED BY (dict_column string) ROW FORMAT DELIMITED FIELDS TERMINATED BY > '\t' STORED AS TEXTFILE ; > //Insert data into flatTableName__group_by table > //global dict columns partitons > INSERT OVERWRITE TABLE > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by > PARTITION (dict_column = 'KYLIN_SALES_SELLER_ID') SELECT a.DICT_KEY FROM > (SELECT KYLIN_SALES_SELLER_ID as DICT_KEY FROM > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 > GROUP BY KYLIN_SALES_SELLER_ID) a LEFT JOIN (SELECT DICT_KEY FROM > db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = > 'KYLIN_SALES_SELLER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS > NULL ; > INSERT OVERWRITE TABLE > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by > PARTITION (dict_column = 'KYLIN_SALES_BUYER_ID') SELECT a.DICT_KEY FROM > (SELECT KYLIN_SALES_BUYER_ID as DICT_KEY FROM > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115 > GROUP BY KYLIN_SALES_BUYER_ID) a LEFT JOIN (SELECT DICT_KEY FROM > db.kylin_sales_cube_mr_notopn_global_dict WHERE DICT_COLUMN = > 'KYLIN_SALES_BUYER_ID') b ON a.DICT_KEY = b.DICT_KEY WHERE b.DICT_KEY IS > NULL; > // KYLIN_MAX_DISTINCT_COUNT special partition > INSERT OVERWRITE TABLE > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by > PARTITION (DICT_COLUMN = 'KYLIN_MAX_DISTINCT_COUNT') SELECT CONCAT_WS(',', > tc.dict_column, cast(tc.total_distinct_val AS String), if(tm.max_dict_val is > null, '0', cast(max_dict_val as string))) FROM (SELECT dict_column,count(1) > total_distinct_val FROM > kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by > where DICT_COLUMN != 'KYLIN_MAX_DISTINCT_COUNT' group by dict_column) tc > LEFT JOIN (SELECT dict_column,if(max(dict_val) is null, 0, max(dict_val)) > as max_dict_val FROM db.kylin_sales_cube_mr_notopn_global_dict group by > dict_column) tm ON tc.dict_column = tm.dict_column; > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)